1 /* $Id: seqdbcommon.cpp 631511 2021-05-19 13:47:54Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kevin Bealer
27 *
28 */
29
30 /// @file seqdbcommon.cpp
31 /// Definitions of various helper functions for SeqDB.
32 #include <ncbi_pch.hpp>
33 #include <corelib/metareg.hpp>
34 #include <corelib/ncbienv.hpp>
35 #include <corelib/ncbifile.hpp>
36 #include <objtools/blast/seqdb_reader/seqdbcommon.hpp>
37 #include <util/sequtil/sequtil.hpp>
38 #include <util/sequtil/sequtil_convert.hpp>
39 #include <objects/seq/seq__.hpp>
40 #include <objects/general/general__.hpp>
41 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
42 #include <objtools/blast/seqdb_reader/impl/seqdbatlas.hpp>
43 #include <objtools/blast/seqdb_reader/seqidlist_reader.hpp>
44 #include <algorithm>
45
46 BEGIN_NCBI_SCOPE
47
48 const string kSeqDBGroupAliasFileName("index.alx");
49
SeqDB_RemoveDirName(CSeqDB_Substring s)50 CSeqDB_Substring SeqDB_RemoveDirName(CSeqDB_Substring s)
51 {
52 int off = s.FindLastOf(CFile::GetPathSeparator());
53
54 if (off != -1) {
55 s.EraseFront(off + 1);
56 }
57
58 return s;
59 }
60
61
SeqDB_RemoveFileName(CSeqDB_Substring s)62 CSeqDB_Substring SeqDB_RemoveFileName(CSeqDB_Substring s)
63 {
64 int off = s.FindLastOf(CFile::GetPathSeparator());
65
66 if (off != -1) {
67 s.Resize(off);
68 } else {
69 s.Clear();
70 }
71
72 return s;
73 }
74
75
SeqDB_RemoveExtn(CSeqDB_Substring s)76 CSeqDB_Substring SeqDB_RemoveExtn(CSeqDB_Substring s)
77 {
78 // This used to remove anything after the last "." it could find.
79 // Then it was changed to only remove the part after the ".", if
80 // it did not contain a "/" character.
81
82 // Now it has been made even stricter, it looks for something like
83 // "(.*)([.][a-zA-Z]{3})" and removes the second sub-expression if
84 // there is a match. This is because of mismatches like "1234.00"
85 // that are not real "file extensions" in the way that SeqDB wants
86 // to process them.
87
88 int slen = s.Size();
89
90 if (slen > 4) {
91 string extn(s.GetEnd()-4, s.GetEnd());
92 string extn2(extn, 2, 4);
93 // Of course, nal and pal are not the only valid
94 // extensions, but this code is only used with these two,
95 // as far as I know, at this moment in time.
96
97 if (extn[0] == '.' &&
98 (extn[1] == 'n' || extn[1] == 'p') &&
99 (extn2 == "al" || extn2 == "in")) {
100 /*
101 isalpha( s[slen-3] ) &&
102 isalpha( s[slen-2] ) &&
103 isalpha( s[slen-1] )) {*/
104
105 s.Resize(slen - 4);
106 }
107 }
108
109 return s;
110 }
111
112
SeqDB_SplitString(CSeqDB_Substring & buffer,CSeqDB_Substring & front,char delim)113 bool SeqDB_SplitString(CSeqDB_Substring & buffer,
114 CSeqDB_Substring & front,
115 char delim)
116 {
117 for(int i = 0; i < buffer.Size(); i++) {
118 if (buffer[i] == delim) {
119 front = buffer;
120
121 buffer.EraseFront(i + 1);
122 front.Resize(i);
123
124 return true;
125 }
126 }
127 return false;
128 }
129
130
SeqDB_CombinePath(const CSeqDB_Substring & one,const CSeqDB_Substring & two,const CSeqDB_Substring * extn,string & outp)131 void SeqDB_CombinePath(const CSeqDB_Substring & one,
132 const CSeqDB_Substring & two,
133 const CSeqDB_Substring * extn,
134 string & outp)
135 {
136 char delim = CFile::GetPathSeparator();
137
138 int extn_amt = extn ? (extn->Size()+1) : 0;
139
140 if (two.Empty()) {
141 // We only use the extension if there is a filename.
142 one.GetString(outp);
143 return;
144 }
145
146 bool only_two = false;
147
148 if (one.Empty() || two[0] == delim) {
149 only_two = true;
150 }
151
152 // Drive letter test for CP/M derived systems
153 if (delim == '\\' &&
154 two.Size() > 3 &&
155 isalpha(two[0]) &&
156 two[1] == ':' &&
157 two[2] == '\\') {
158
159 only_two = true;
160 }
161
162 if (only_two) {
163 outp.reserve(two.Size() + extn_amt);
164 two.GetString(outp);
165
166 if (extn) {
167 outp.append(".");
168 outp.append(extn->GetBegin(), extn->GetEnd());
169 }
170 return;
171 }
172
173 outp.reserve(one.Size() + two.Size() + 1 + extn_amt);
174
175 one.GetString(outp);
176
177 if (outp[outp.size() - 1] != delim) {
178 outp += delim;
179 }
180
181 outp.append(two.GetBegin(), two.GetEnd());
182
183 if (extn) {
184 outp.append(".");
185 outp.append(extn->GetBegin(), extn->GetEnd());
186 }
187 }
188
189
SeqDB_CompareVolume(const string & s1,const string & s2)190 bool SeqDB_CompareVolume(const string & s1, const string & s2)
191 {
192 string x1, x2;
193 CSeqDB_Path(s1).FindBaseName().GetString(x1);
194 CSeqDB_Path(s2).FindBaseName().GetString(x2);
195 if (x1 != x2) return (x1 < x2);
196 else return (s1 < s2);
197 }
198
199 /// File existence test interface.
200 class CSeqDB_FileExistence {
201 public:
202 /// Destructor
~CSeqDB_FileExistence()203 virtual ~CSeqDB_FileExistence()
204 {
205 }
206
207 /// Check if file exists at fully qualified path.
208 /// @param fname Filename.
209 /// @return True if the file was found.
210 virtual bool DoesFileExist(const string & fname) = 0;
211 };
212
213
214 /// Test whether an index or alias file exists
215 ///
216 /// The provide filename is combined with both of the extensions
217 /// appropriate to the database sequence type, and the resulting
218 /// strings are checked for existence in the file system. The
219 /// 'access' object defines how to check file existence.
220 ///
221 /// @param dbname
222 /// Input path and filename
223 /// @param dbtype
224 /// Database type, either protein or nucleotide
225 /// @param access
226 /// The file access object.
227 /// @param linkoutdb_search
228 /// Determines whether linkoutdb files should be searched for
229 /// @return
230 /// true if either of the index or alias files is found
231
s_SeqDB_DBExists(const string & dbname,char dbtype,CSeqDB_FileExistence & access,bool linkoutdb_search)232 static bool s_SeqDB_DBExists(const string & dbname,
233 char dbtype,
234 CSeqDB_FileExistence & access,
235 bool linkoutdb_search)
236 {
237 string path;
238 path.reserve(dbname.size() + 4);
239 path.assign(dbname.data(), dbname.data() + dbname.size());
240
241 if (linkoutdb_search) {
242 _ASSERT(dbtype == 'p');
243 path.append(".sqlite3");
244 if (access.DoesFileExist(path)) {
245 return true;
246 }
247 } else {
248 path.append(".-al");
249
250 path[path.size()-3] = dbtype;
251
252 if (access.DoesFileExist(path)) {
253 return true;
254 }
255
256 path[path.size()-2] = 'i';
257 path[path.size()-1] = 'n';
258
259 if (access.DoesFileExist(path)) {
260 return true;
261 }
262 }
263
264 return false;
265 }
266
267
268 /// Returns the character used to seperate path components in the
269 /// current operating system or platform.
s_GetPathSplitter()270 static string s_GetPathSplitter()
271 {
272 const char * splitter = 0;
273
274 #if defined(NCBI_OS_UNIX)
275 splitter = ":";
276 #else
277 splitter = ";";
278 #endif
279
280 return splitter;
281 }
282
283
SeqDB_ConvertOSPath(string & dbs)284 void SeqDB_ConvertOSPath(string & dbs)
285 {
286 // See also CDirEntry::ConvertToOSPath()
287
288 char delim = CDirEntry::GetPathSeparator();
289
290 for(size_t i = 0; i<dbs.size(); i++) {
291 if (dbs[i] == '/' || dbs[i] == '\\') {
292 dbs[i] = delim;
293 }
294 }
295 }
296
297
SeqDB_MakeOSPath(const string & dbs)298 string SeqDB_MakeOSPath(const string & dbs)
299 {
300 string cvt(dbs);
301 SeqDB_ConvertOSPath(cvt);
302 return cvt;
303 }
304
305
306 /// Search for a file in a provided set of paths
307 ///
308 /// This function takes a search path as a ":" delimited set of path
309 /// names, and searches in those paths for the given database
310 /// component. The component name may include path components. If
311 /// the exact flag is set, the path is assumed to contain any required
312 /// extension; otherwise extensions for index and alias files will be
313 /// tried. Each element of the search path is tried in sequential
314 /// order for both index or alias files (if exact is not set), before
315 /// moving to the next element of the search path. The path returned
316 /// from this function will not contain a file extension unless the
317 /// provided filename did (in which case, exact is normally set).
318 ///
319 /// @param blast_paths
320 /// List of filesystem paths seperated by ":".
321 /// @param dbname
322 /// Base name of the database index or alias file to search for.
323 /// @param dbtype
324 /// Type of database, either protein or nucleotide.
325 /// @param exact
326 /// Set to true if dbname already contains any needed extension.
327 /// @param linkoutdb_search
328 /// Determines whether linkoutdb files should be searched for
329 /// @return
330 /// Full pathname, minus extension, or empty string if none found.
331
s_SeqDB_TryPaths(const string & blast_paths,const string & dbname,char dbtype,bool exact,CSeqDB_FileExistence & access,bool linkoutdb_search=false)332 static string s_SeqDB_TryPaths(const string & blast_paths,
333 const string & dbname,
334 char dbtype,
335 bool exact,
336 CSeqDB_FileExistence & access,
337 bool linkoutdb_search = false)
338 {
339 // 1. If this was a vector<CSeqDB_Substring>, the tokenize would
340 // not need to do any allocations (but would need rewriting).
341 //
342 // 2. If this was split into several functions, and/or a stateful
343 // class was used, this would perform better here, and would
344 // allow improvement of the search routine for combined group
345 // indices (see comments in CSeqDBAliasSets::FindAliasPath).
346
347 vector<string> roads;
348 NStr::Split(blast_paths, s_GetPathSplitter(), roads, NStr::fSplit_Tokenize);
349
350 string result;
351 string attempt;
352
353 ITERATE(vector<string>, road, roads) {
354 attempt.erase();
355
356 SeqDB_CombinePath(CSeqDB_Substring(SeqDB_MakeOSPath(*road)),
357 CSeqDB_Substring(dbname),
358 0,
359 attempt);
360
361 if (exact) {
362 if (access.DoesFileExist(attempt)) {
363 result = attempt;
364 break;
365 }
366 } else {
367 if (s_SeqDB_DBExists(attempt, dbtype, access, linkoutdb_search)) {
368 result = attempt;
369 break;
370 }
371 }
372 }
373
374 return result;
375 }
376
377 static string
s_SeqDB_FindBlastDBPath(const string & dbname,char dbtype,string * sp,bool exact,CSeqDB_FileExistence & access,const string path="")378 s_SeqDB_FindBlastDBPath(const string & dbname,
379 char dbtype,
380 string * sp,
381 bool exact,
382 CSeqDB_FileExistence & access,
383 const string path="")
384 {
385 const string pathology = (path=="") ? CSeqDBAtlas::GenerateSearchPath() : path;
386
387 if (sp) {
388 *sp = pathology;
389 }
390
391 return s_SeqDB_TryPaths(pathology, dbname, dbtype, exact, access);
392 }
393
394 /// Check file existence using CSeqDBAtlas.
395 class CSeqDB_AtlasAccessor : public CSeqDB_FileExistence {
396 public:
397 /// Constructor.
CSeqDB_AtlasAccessor(CSeqDBAtlas & atlas)398 CSeqDB_AtlasAccessor(CSeqDBAtlas & atlas)
399 : m_Atlas (atlas)
400 {
401 }
402
403 /// Test file existence.
404 /// @param fname Fully qualified name of file for which to look.
405 /// @return True iff file exists.
DoesFileExist(const string & fname)406 virtual bool DoesFileExist(const string & fname)
407 {
408 return m_Atlas.DoesFileExist(fname);
409 }
410
411 private:
412 CSeqDBAtlas & m_Atlas;
413 };
414
415
SeqDB_FindBlastDBPath(const string & dbname,char dbtype,string * sp,bool exact,CSeqDBAtlas & atlas)416 string SeqDB_FindBlastDBPath(const string & dbname,
417 char dbtype,
418 string * sp,
419 bool exact,
420 CSeqDBAtlas & atlas)
421 {
422 CSeqDB_AtlasAccessor access(atlas);
423
424 return s_SeqDB_FindBlastDBPath(dbname,
425 dbtype,
426 sp,
427 exact,
428 access,
429 atlas.GetSearchPath());
430 }
431
432
433 /// Check file existence using CFile.
434 class CSeqDB_SimpleAccessor : public CSeqDB_FileExistence {
435 public:
436 /// Constructor.
CSeqDB_SimpleAccessor()437 CSeqDB_SimpleAccessor()
438 {
439 }
440
441 /// Test file existence.
442 /// @param fname Fully qualified name of file for which to look.
443 /// @return True iff file exists.
DoesFileExist(const string & fname)444 virtual bool DoesFileExist(const string & fname)
445 {
446 // Use the same criteria as the Atlas code would.
447 CFile whole(SeqDB_MakeOSPath(fname));
448 return whole.GetLength() != (Int8) -1;
449 }
450 };
451
452
SeqDB_ResolveDbPath(const string & filename)453 string SeqDB_ResolveDbPath(const string & filename)
454 {
455 CSeqDB_SimpleAccessor access;
456
457 return s_SeqDB_FindBlastDBPath(filename,
458 '-',
459 0,
460 true,
461 access);
462 }
463
SeqDB_ResolveDbPathNoExtension(const string & filename,char dbtype)464 string SeqDB_ResolveDbPathNoExtension(const string & filename,
465 char dbtype /* = '-' */)
466 {
467 CSeqDB_SimpleAccessor access;
468
469 return s_SeqDB_FindBlastDBPath(filename, dbtype, 0, false, access);
470 }
471
SeqDB_ResolveDbPathForLinkoutDB(const string & filename)472 string SeqDB_ResolveDbPathForLinkoutDB(const string & filename)
473 {
474 const char dbtype('p'); // this is determined by blastdb_links application
475 CSeqDB_SimpleAccessor access;
476 const string pathology = CSeqDBAtlas::GenerateSearchPath();
477 return s_SeqDB_TryPaths(pathology, filename, dbtype, false, access, true);
478 }
479
SeqDB_JoinDelim(string & a,const string & b,const string & delim)480 void SeqDB_JoinDelim(string & a, const string & b, const string & delim)
481 {
482 if (b.empty()) {
483 return;
484 }
485
486 if (a.empty()) {
487 // a has no size - but might have capacity
488 s_SeqDB_QuickAssign(a, b);
489 return;
490 }
491
492 size_t newlen = a.length() + b.length() + delim.length();
493
494 if (a.capacity() < newlen) {
495 size_t newcap = 16;
496
497 while(newcap < newlen) {
498 newcap <<= 1;
499 }
500
501 a.reserve(newcap);
502 }
503
504 a += delim;
505 a += b;
506 }
507
508
CSeqDBGiList()509 CSeqDBGiList::CSeqDBGiList()
510 : m_CurrentOrder(eNone)
511 {
512 }
513
514
515 /// Compare SGiOid structs by OID.
516 class CSeqDB_SortOidLessThan {
517 public:
518 /// Test whether lhs is less than (occurs before) rhs.
519 /// @param lhs Left hand side of less-than operator. [in]
520 /// @param rhs Right hand side of less-than operator. [in]
521 /// @return True if lhs has a lower OID than rhs.
operator ()(const CSeqDBGiList::SGiOid & lhs,const CSeqDBGiList::SGiOid & rhs)522 int operator()(const CSeqDBGiList::SGiOid & lhs,
523 const CSeqDBGiList::SGiOid & rhs)
524 {
525 return lhs.oid < rhs.oid;
526 }
527 };
528
529
530 /// Compare SGiOid structs by GI.
531 class CSeqDB_SortGiLessThan {
532 public:
533 /// Test whether lhs is less than (occurs before) rhs.
534 /// @param lhs Left hand side of less-than operator. [in]
535 /// @param rhs Right hand side of less-than operator. [in]
536 /// @return True if lhs has a lower GI than rhs.
operator ()(const CSeqDBGiList::SGiOid & lhs,const CSeqDBGiList::SGiOid & rhs)537 int operator()(const CSeqDBGiList::SGiOid & lhs,
538 const CSeqDBGiList::SGiOid & rhs)
539 {
540 return lhs.gi < rhs.gi;
541 }
542 };
543
544 class CSeqDB_SortPigLessThan {
545 public:
546 /// Test whether lhs is less than (occurs before) rhs.
547 /// @param lhs Left hand side of less-than operator. [in]
548 /// @param rhs Right hand side of less-than operator. [in]
549 /// @return True if lhs has a lower GI than rhs.
operator ()(const CSeqDBGiList::SPigOid & lhs,const CSeqDBGiList::SPigOid & rhs)550 int operator()(const CSeqDBGiList::SPigOid & lhs,
551 const CSeqDBGiList::SPigOid & rhs)
552 {
553 return lhs.pig < rhs.pig;
554 }
555 };
556
557
558
559 /// Compare SGiOid structs by GI.
560 class CSeqDB_SortTiLessThan {
561 public:
562 /// Test whether lhs is less than (occurs before) rhs.
563 /// @param lhs Left hand side of less-than operator. [in]
564 /// @param rhs Right hand side of less-than operator. [in]
565 /// @return True if lhs has a lower GI than rhs.
operator ()(const CSeqDBGiList::STiOid & lhs,const CSeqDBGiList::STiOid & rhs)566 int operator()(const CSeqDBGiList::STiOid & lhs,
567 const CSeqDBGiList::STiOid & rhs)
568 {
569 return lhs.ti < rhs.ti;
570 }
571 };
572
573
574 /// Compare SSeqIdOid structs by SeqId.
575 class CSeqDB_SortSiLessThan {
576 public:
577 /// Test whether lhs is less than (occurs before) rhs.
578 /// @param lhs Left hand side of less-than operator. [in]
579 /// @param rhs Right hand side of less-than operator. [in]
580 /// @return True if lhs sorts before rhs by Seq-id.
operator ()(const CSeqDBGiList::SSiOid & lhs,const CSeqDBGiList::SSiOid & rhs)581 int operator()(const CSeqDBGiList::SSiOid & lhs,
582 const CSeqDBGiList::SSiOid & rhs)
583 {
584 return lhs.si < rhs.si;
585 }
586 };
587
588
589 template<class TCompare, class TVector>
s_InsureOrder(TVector & v)590 void s_InsureOrder(TVector & v)
591 {
592 bool already = true;
593
594 TCompare compare_less;
595
596 for(int i = 1; i < (int) v.size(); i++) {
597 if (compare_less(v[i], v[i-1])) {
598 already = false;
599 break;
600 }
601 }
602
603 if (! already) {
604 sort(v.begin(), v.end(), compare_less);
605 }
606 }
607
PreprocessIdsForISAMSiLookup()608 void CSeqDBGiList::PreprocessIdsForISAMSiLookup()
609 {
610 NON_CONST_ITERATE(vector<CSeqDBGiList::SSiOid>, itr, m_SisOids) {
611 string str_id = SeqDB_SimplifyAccession(itr->si);
612 itr->si = NStr::ToLower(str_id);
613 }
614 }
615
InsureOrder(ESortOrder order)616 void CSeqDBGiList::InsureOrder(ESortOrder order)
617 {
618 // Code depends on OID order after translation, because various
619 // methods of SeqDB use this class for filtering purposes.
620 static CFastMutex mtx;
621 CFastMutexGuard mtx_gurad(mtx);
622 if ((order < m_CurrentOrder) || (order == eNone)) {
623 NCBI_THROW(CSeqDBException,
624 eFileErr,
625 "Out of sequence sort order requested.");
626 }
627
628 // Input is usually sorted by GI, so we first test for sortedness.
629 // If it will fail it will probably do so almost immediately.
630
631 if (order != m_CurrentOrder) {
632 switch(order) {
633 case eNone:
634 break;
635
636 case eGi:
637 s_InsureOrder<CSeqDB_SortGiLessThan>(m_GisOids);
638 s_InsureOrder<CSeqDB_SortTiLessThan>(m_TisOids);
639 s_InsureOrder<CSeqDB_SortSiLessThan>(m_SisOids);
640 s_InsureOrder<CSeqDB_SortPigLessThan>(m_PigsOids);
641 break;
642
643 default:
644 NCBI_THROW(CSeqDBException,
645 eFileErr,
646 "Unrecognized sort order requested.");
647 }
648
649 m_CurrentOrder = order;
650 }
651 }
652
653
FindGi(TGi gi) const654 bool CSeqDBGiList::FindGi(TGi gi) const
655 {
656 int oid(0), index(0);
657 return (const_cast<CSeqDBGiList *>(this))->GiToOid(gi, oid, index);
658 }
659
660
GiToOid(TGi gi,int & oid)661 bool CSeqDBGiList::GiToOid(TGi gi, int & oid)
662 {
663 int index(0);
664 return GiToOid(gi, oid, index);
665 }
666
667
GiToOid(TGi gi,int & oid,int & index)668 bool CSeqDBGiList::GiToOid(TGi gi, int & oid, int & index)
669 {
670 InsureOrder(eGi); // would assert be better?
671
672 int b(0), e((int)m_GisOids.size());
673
674 while(b < e) {
675 int m = (b + e)/2;
676 TGi m_gi = m_GisOids[m].gi;
677
678 if (m_gi < gi) {
679 b = m + 1;
680 } else if (m_gi > gi) {
681 e = m;
682 } else {
683 oid = m_GisOids[m].oid;
684 index = m;
685 return true;
686 }
687 }
688
689 oid = index = -1;
690 return false;
691 }
692
693
FindTi(TTi ti) const694 bool CSeqDBGiList::FindTi(TTi ti) const
695 {
696 int oid(0), index(0);
697 return (const_cast<CSeqDBGiList *>(this))->TiToOid(ti, oid, index);
698 }
699
700
TiToOid(TTi ti,int & oid)701 bool CSeqDBGiList::TiToOid(TTi ti, int & oid)
702 {
703 int index(0);
704 return TiToOid(ti, oid, index);
705 }
706
707
TiToOid(TTi ti,int & oid,int & index)708 bool CSeqDBGiList::TiToOid(TTi ti, int & oid, int & index)
709 {
710 InsureOrder(eGi); // would assert be better?
711
712 int b(0), e((int)m_TisOids.size());
713
714 while(b < e) {
715 int m = (b + e)/2;
716 TTi m_ti = m_TisOids[m].ti;
717
718 if (m_ti < ti) {
719 b = m + 1;
720 } else if (m_ti > ti) {
721 e = m;
722 } else {
723 oid = m_TisOids[m].oid;
724 index = m;
725 return true;
726 }
727 }
728
729 oid = index = -1;
730 return false;
731 }
732
FindSi(const string & si) const733 bool CSeqDBGiList::FindSi(const string &si) const
734 {
735 int oid(0), index(0);
736 return (const_cast<CSeqDBGiList *>(this))->SiToOid(si, oid, index);
737 }
738
SiToOid(const string & si,int & oid)739 bool CSeqDBGiList::SiToOid(const string &si, int & oid)
740 {
741 int index(0);
742 return SiToOid(si, oid, index);
743 }
744
SiToOid(const string & si,int & oid,int & index)745 bool CSeqDBGiList::SiToOid(const string &si, int & oid, int & index)
746 {
747 InsureOrder(eGi);
748
749 int b(0), e((int)m_SisOids.size());
750
751 while(b < e) {
752 int m = (b + e)/2;
753 const string & m_si = m_SisOids[m].si;
754
755 if (m_si < si) {
756 b = m + 1;
757 } else if (si < m_si) {
758 e = m;
759 } else {
760 oid = m_SisOids[m].oid;
761 index = m;
762 return true;
763 }
764 }
765
766 oid = index = -1;
767 return false;
768 }
769
770 void
GetGiList(vector<TGi> & gis) const771 CSeqDBGiList::GetGiList(vector<TGi>& gis) const
772 {
773 gis.clear();
774 gis.reserve(GetNumGis());
775
776 ITERATE(vector<SGiOid>, itr, m_GisOids) {
777 gis.push_back(itr->gi);
778 }
779 }
780
781 void
GetPigList(vector<TPig> & pigs) const782 CSeqDBGiList::GetPigList(vector<TPig>& pigs) const
783 {
784 pigs.clear();
785 pigs.reserve(GetNumPigs());
786
787 ITERATE(vector<SPigOid>, itr, m_PigsOids) {
788 pigs.push_back(itr->pig);
789 }
790 }
791
792 void
GetTiList(vector<TTi> & tis) const793 CSeqDBGiList::GetTiList(vector<TTi>& tis) const
794 {
795 tis.clear();
796 tis.reserve(GetNumTis());
797
798 ITERATE(vector<STiOid>, itr, m_TisOids) {
799 tis.push_back(itr->ti);
800 }
801 }
802
803
804 void
GetSiList(vector<string> & sis) const805 CSeqDBGiList::GetSiList(vector<string>& sis) const
806 {
807 sis.clear();
808 sis.reserve(GetNumSis());
809
810 ITERATE(vector<SSiOid>, itr, m_SisOids) {
811 sis.push_back(itr->si);
812 }
813 }
814
815
816
817
818
SeqDB_ReadBinaryGiList(const string & fname,vector<TGi> & gis)819 void SeqDB_ReadBinaryGiList(const string & fname, vector<TGi> & gis)
820 {
821 CMemoryFile mfile(SeqDB_MakeOSPath(fname));
822
823 Uint4 * beginp = (Uint4*) mfile.GetPtr();
824 Uint4 * endp = (Uint4*) (((char*)mfile.GetPtr()) + mfile.GetSize());
825
826 Int4 num_gis = (Int4) (endp - beginp) - 2;
827
828 gis.clear();
829
830 if (((endp - beginp) < 2U)
831 || (beginp[0] != 0xFFFFFFFFU)
832 || (SeqDB_GetStdOrd(beginp + 1) != (Uint4) num_gis)) {
833 NCBI_THROW(CSeqDBException,
834 eFileErr,
835 "Specified file is not a valid binary GI file.");
836 }
837
838 gis.reserve(num_gis);
839
840 for(Uint4 * elem = (beginp + 2); elem < endp; ++elem) {
841 gis.push_back(GI_FROM(Uint4, SeqDB_GetStdOrd(elem)));
842 }
843 }
844
845 /// This function determines whether a file is a valid binary GI/TI file.
846 /// @param fbeginp pointer to start of file [in]
847 /// @param fendp pointer to end of file [in]
848 /// @param has_long_ids will be set to true if the gi file contains long IDs [out]
849 /// @param has_tis will be set to true if the input file contains Trace IDs,
850 /// otherwise the file contains GIs [out]
851 /// @returns true if file is binary
852 /// @throws CSeqDBException if file is empty or invalid gi file
853 static
s_SeqDB_IsBinaryNumericList(const char * fbeginp,const char * fendp,bool & has_long_ids,bool * has_tis=NULL)854 bool s_SeqDB_IsBinaryNumericList(const char* fbeginp, const char* fendp,
855 bool& has_long_ids, bool* has_tis = NULL)
856 {
857 bool retval = false;
858 has_long_ids = false;
859 if (has_tis)
860 *has_tis = false;
861 Uint8 file_size = fendp - fbeginp;
862
863 if (file_size == 0) {
864 NCBI_THROW(CSeqDBException,
865 eFileErr,
866 "Specified file is empty.");
867 } else if (isdigit((unsigned char)(*((char*) fbeginp))) ||
868 ((unsigned char)(*((char*) fbeginp)) == '#')) {
869 retval = false;
870 } else if ((file_size >= 8) && ((*fbeginp & 0xFF) == 0xFF)) {
871 retval = true;
872
873 int marker = fbeginp[3] & 0xFF;
874
875 if (marker == 0xFE || marker == 0xFC) {
876 has_long_ids = true;
877 }
878 if (has_tis && (marker == 0xFD || marker == 0xFC)) {
879 *has_tis = true;
880 }
881 } else {
882 NCBI_THROW(CSeqDBException,
883 eFileErr,
884 "Specified file is not a valid GI/TI list.");
885 }
886 return retval;
887 }
888
s_ReadDigit(const char d,const string & list_type)889 int s_ReadDigit(const char d, const string & list_type)
890 {
891 switch(d) {
892 case '0':
893 return 0;
894 case '1':
895 return 1;
896 case '2':
897 return 2;
898 case '3':
899 return 3;
900 case '4':
901 return 4;
902 case '5':
903 return 5;
904 case '6':
905 return 6;
906 case '7':
907 return 7;
908 case '8':
909 return 8;
910 case '9':
911 return 9;
912 case ' ':
913 case '\n':
914 case '\r':
915 return -1;
916 default:
917 {
918 string msg = string("Invalid byte in text" + list_type + " list [") +
919 NStr::UIntToString((unsigned char) d) + "].";
920 NCBI_THROW(CSeqDBException, eFileErr, msg);
921 }
922 }
923 }
924
SeqDB_ReadMemoryGiList(const char * fbeginp,const char * fendp,vector<CSeqDBGiList::SGiOid> & gis,bool * in_order)925 void SeqDB_ReadMemoryGiList(const char * fbeginp,
926 const char * fendp,
927 vector<CSeqDBGiList::SGiOid> & gis,
928 bool * in_order)
929 {
930 bool long_ids = false;
931 Uint8 file_size = fendp - fbeginp;
932
933 if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
934 _ASSERT(long_ids == false);
935 Uint4* bbeginp = (Uint4*) fbeginp;
936 Uint4* bendp = (Uint4*) fendp;
937
938 Uint8 num_gis = bendp - bbeginp - 2;
939
940 gis.clear();
941
942 if ((bbeginp[0] != 0xFFFFFFFFU)
943 || (SeqDB_GetStdOrd(bbeginp + 1) != (Uint4) num_gis)) {
944 NCBI_THROW(CSeqDBException,
945 eFileErr,
946 "Specified file is not a valid binary GI file.");
947 }
948
949 gis.reserve(num_gis);
950
951 if (in_order) {
952 TGi prev_gi = ZERO_GI;
953 bool in_gi_order = true;
954
955 Uint4* elem = bbeginp + 2;
956 while(elem < bendp) {
957 TGi this_gi = GI_FROM(Uint4, SeqDB_GetStdOrd(elem));
958 gis.push_back(this_gi);
959
960 if (prev_gi > this_gi) {
961 in_gi_order = false;
962 break;
963 }
964 prev_gi = this_gi;
965 elem++;
966 }
967
968 while(elem < bendp) {
969 gis.push_back(GI_FROM(Uint4, SeqDB_GetStdOrd(elem++)));
970 }
971
972 *in_order = in_gi_order;
973 } else {
974 for(Uint4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
975 gis.push_back(GI_FROM(Uint4, SeqDB_GetStdOrd(elem)));
976 }
977 }
978 } else {
979 _ASSERT(long_ids == false);
980 // We would prefer to do only one allocation, so assume
981 // average gi is 6 digits plus newline. A few extra will be
982 // allocated, but this is preferable to letting the vector
983 // double itself (which it still will do if needed).
984
985 gis.reserve((int) (file_size / 7));
986
987 Uint4 elem(0);
988 const string list_type("GI");
989
990 for(const char * p = fbeginp; p < fendp; p ++) {
991 int dig = s_ReadDigit(*p, list_type);
992 if (dig == -1) {
993 if (elem != 0) {
994 gis.push_back(GI_FROM(Uint4, elem));
995 }
996 elem = 0;
997 continue;
998 }
999 elem *= 10;
1000 elem += dig;
1001 }
1002 }
1003 }
1004
SeqDB_ReadMemoryPigList(const char * fbeginp,const char * fendp,vector<CSeqDBGiList::SPigOid> & pigs,bool * in_order)1005 void SeqDB_ReadMemoryPigList(const char * fbeginp,
1006 const char * fendp,
1007 vector<CSeqDBGiList::SPigOid> & pigs,
1008 bool * in_order)
1009 {
1010 bool long_ids = false;
1011 Int8 file_size = fendp - fbeginp;
1012
1013 if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
1014 Uint4* bbeginp = (Uint4*) fbeginp;
1015 Uint4* bendp = (Uint4*) fendp;
1016
1017 Int4 num_pigs = (Int4) (bendp - bbeginp) - 2;
1018
1019 pigs.clear();
1020
1021 if (((bendp - bbeginp) < 2U)
1022 || (bbeginp[0] != 0xFFFFFFFFU)
1023 || (SeqDB_GetStdOrd(bbeginp + 1) != (Uint4) num_pigs)) {
1024 NCBI_THROW(CSeqDBException,
1025 eFileErr,
1026 "Specified file is not a valid binary IPG file.");
1027 }
1028
1029 pigs.reserve(num_pigs);
1030
1031 if (in_order) {
1032 TPig prev_pig = 0;
1033 bool sorted = true;
1034
1035 Uint4* elem = bbeginp + 2;
1036 while(elem < bendp) {
1037 TPig this_pig = SeqDB_GetStdOrd(elem);
1038 pigs.push_back(this_pig);
1039
1040 if (prev_pig > this_pig) {
1041 sorted = false;
1042 break;
1043 }
1044 prev_pig = this_pig;
1045 elem++;
1046 }
1047
1048 while(elem < bendp) {
1049 pigs.push_back(SeqDB_GetStdOrd(elem++));
1050 }
1051
1052 *in_order = sorted;
1053 } else {
1054 for(Uint4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
1055 pigs.push_back(SeqDB_GetStdOrd(elem));
1056 }
1057 }
1058 } else {
1059 pigs.reserve((int) (file_size / 7));
1060
1061 Uint4 elem(0);
1062 const string list_type("IPG");
1063
1064 for(const char * p = fbeginp; p < fendp; p ++) {
1065 int dig = s_ReadDigit(*p, list_type);
1066 if (dig == -1) {
1067 // Skip blank lines or comments by ignoring zero.
1068 if (elem != 0) {
1069 pigs.push_back(elem);
1070 }
1071 elem = 0;
1072 continue;
1073 }
1074 elem *= 10;
1075 elem += dig;
1076 }
1077 }
1078 }
1079
SeqDB_ReadMemoryTaxIdList(const char * fbeginp,const char * fendp,CSeqDBGiList::STaxIdsOids & taxids)1080 void SeqDB_ReadMemoryTaxIdList(const char * fbeginp,
1081 const char * fendp,
1082 CSeqDBGiList::STaxIdsOids & taxids)
1083 {
1084 bool long_ids = false;
1085 if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
1086 Int4* bbeginp = (Int4*) fbeginp;
1087 Int4* bendp = (Int4*) fendp;
1088
1089 Uint8 num_taxids = (bendp - bbeginp) - 2;
1090
1091 taxids.tax_ids.clear();
1092 taxids.oids.clear();
1093
1094 if (((bendp - bbeginp) < 2) || (bbeginp[0] != 0xFFFFFFFF)
1095 || (SeqDB_GetStdOrd(bbeginp + 1) != (Int4) num_taxids)) {
1096 NCBI_THROW(CSeqDBException, eFileErr,
1097 "Specified file is not a valid binary Tax Id List file.");
1098 }
1099
1100 for(Int4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
1101 taxids.tax_ids.insert(TAX_ID_FROM(Int4, SeqDB_GetStdOrd(elem)));
1102 }
1103 } else {
1104 Int4 elem(0);
1105 const string list_type("TAXID");
1106
1107 for(const char * p = fbeginp; p < fendp; p ++) {
1108 int dig = s_ReadDigit(*p, list_type);
1109 if (dig == -1) {
1110 // Skip blank lines or comments by ignoring zero.
1111 if (elem != 0) {
1112 taxids.tax_ids.insert(TAX_ID_FROM(Int4, elem));
1113 }
1114 elem = 0;
1115 continue;
1116 }
1117 elem *= 10;
1118 elem += dig;
1119 }
1120 }
1121 }
1122
1123 // [ NOTE: The 8 byte versions described here are not yet
1124 // implemented. ]
1125 //
1126 // FF..FF = -1 -> GI list <32 bit>
1127 // FF..FE = -2 -> GI list <64 bit>
1128 // FF..FD = -3 -> TI list <32 bit>
1129 // FF..FC = -4 -> TI list <64 bit>
1130 //
1131 // Format of the 8 byte TI list; note that we are still limited to
1132 // 2^32-1 TIs, which would involve an 32 GB identifier list file; this
1133 // code (in its current form) will not work at all on a 32 bit system
1134 // for GI files with more than about 500 megasequences, or TI files
1135 // with more than about 256 megasequences, assuming the current 16
1136 // bytes per vector element. This is larger than the current total
1137 // number of GI sequences, but not larger than the number of TIs, so a
1138 // TI query for all TIs everywhere will most likely choke on a 32 bit
1139 // system because the data will simply not fit into memory (there are
1140 // nearly that many active TIs and the program will have other memory
1141 // expenditures.)
1142 //
1143 // 4 bytes: FF FF FF F?
1144 // 4 bytes: <number of TIs>
1145 // 8 bytes: TI#0
1146 // 8 bytes: TI#1
1147 // ...
1148
SeqDB_ReadMemoryTiList(const char * fbeginp,const char * fendp,vector<CSeqDBGiList::STiOid> & tis,bool * in_order)1149 void SeqDB_ReadMemoryTiList(const char * fbeginp,
1150 const char * fendp,
1151 vector<CSeqDBGiList::STiOid> & tis,
1152 bool * in_order)
1153 {
1154 bool long_ids = false;
1155 Int8 file_size = fendp - fbeginp;
1156
1157 if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
1158 Int4 * bbeginp = (Int4*) fbeginp;
1159 Int4 * bendp = (Int4*) fendp;
1160 Int4 * bdatap = bbeginp + 2;
1161
1162 Uint4 num_tis = (int)(bendp-bdatap);
1163
1164 int remainder = num_tis % 2;
1165
1166 if (long_ids) {
1167 num_tis /= 2;
1168 }
1169
1170 tis.clear();
1171
1172 bool bad_fmt = false;
1173
1174 if (bendp < bdatap) {
1175 bad_fmt = true;
1176 } else {
1177 int marker = SeqDB_GetStdOrd(bbeginp);
1178 unsigned num_ids = SeqDB_GetStdOrd(bbeginp+1);
1179
1180 if ((marker != -3 && marker != -4) ||
1181 (num_ids != num_tis) ||
1182 (remainder && long_ids)) {
1183
1184 bad_fmt = true;
1185 }
1186 }
1187
1188 if (bad_fmt) {
1189 NCBI_THROW(CSeqDBException,
1190 eFileErr,
1191 "Specified file is not a valid binary GI or TI file.");
1192 }
1193
1194 tis.reserve(num_tis);
1195
1196 if (long_ids) {
1197 Int8 * bdatap8 = (Int8*) bdatap;
1198 Int8 * bendp8 = (Int8*) bendp;
1199
1200 if (in_order) {
1201 Int8 prev_ti =0;
1202 bool in_ti_order = true;
1203
1204 Int8 * elem = bdatap8;
1205
1206 while(elem < bendp8) {
1207 Int8 this_ti = (Int8) SeqDB_GetStdOrd(elem);
1208 tis.push_back(this_ti);
1209
1210 if (prev_ti > this_ti) {
1211 in_ti_order = false;
1212 break;
1213 }
1214 prev_ti = this_ti;
1215 elem ++;
1216 }
1217
1218 while(elem < bendp8) {
1219 tis.push_back((Int8) SeqDB_GetStdOrd(elem++));
1220 }
1221
1222 *in_order = in_ti_order;
1223 } else {
1224 for(Int8 * elem = bdatap8; elem < bendp8; elem ++) {
1225 tis.push_back((Int8) SeqDB_GetStdOrd(elem));
1226 }
1227 }
1228 } else {
1229 if (in_order) {
1230 int prev_ti =0;
1231 bool in_ti_order = true;
1232
1233 Int4 * elem = bdatap;
1234
1235 while(elem < bendp) {
1236 int this_ti = (int) SeqDB_GetStdOrd(elem);
1237 tis.push_back(this_ti);
1238
1239 if (prev_ti > this_ti) {
1240 in_ti_order = false;
1241 break;
1242 }
1243 prev_ti = this_ti;
1244 elem ++;
1245 }
1246
1247 while(elem < bendp) {
1248 tis.push_back((int) SeqDB_GetStdOrd(elem++));
1249 }
1250
1251 *in_order = in_ti_order;
1252 } else {
1253 for(Int4 * elem = bdatap; elem < bendp; elem ++) {
1254 tis.push_back((int) SeqDB_GetStdOrd(elem));
1255 }
1256 }
1257 }
1258 } else {
1259 // We would prefer to do only one allocation, so assume
1260 // average gi is 6 digits plus newline. A few extra will be
1261 // allocated, but this is preferable to letting the vector
1262 // double itself (which it still will do if needed).
1263
1264 tis.reserve(int(file_size / 7));
1265
1266 Int8 elem(0);
1267 const string list_type("TI");
1268
1269 for(const char * p = fbeginp; p < fendp; p ++) {
1270 int dig = s_ReadDigit(*p, list_type);
1271 if (dig == -1) {
1272 if (elem != 0) {
1273 tis.push_back(elem);
1274 }
1275 elem = 0;
1276 continue;
1277 }
1278 elem *= 10;
1279 elem += dig;
1280 }
1281 }
1282 }
1283
SeqDB_ReadMemorySiList(const char * fbeginp,const char * fendp,vector<CSeqDBGiList::SSiOid> & sis,bool * in_order)1284 void SeqDB_ReadMemorySiList(const char * fbeginp,
1285 const char * fendp,
1286 vector<CSeqDBGiList::SSiOid> & sis,
1287 bool * in_order)
1288 {
1289 Int8 file_size = fendp - fbeginp;
1290
1291 // We would prefer to do only one allocation, so assume
1292 // average seqid is 6 digits plus newline. A few extra will be
1293 // allocated, but this is preferable to letting the vector
1294 // double itself (which it still will do if needed).
1295
1296 sis.reserve(sis.size() + int(file_size / 7));
1297
1298 const char * p = fbeginp;
1299 const char * head;
1300 while ( p < fendp) {
1301 // find the head of the seqid
1302 while (p< fendp && (*p=='>' || *p==' ' || *p=='\t' || *p=='\n' || *p=='\r')) ++p;
1303 if (p< fendp && *p == '#') {
1304 // anything beyond this point in the line is a comment
1305 while (p< fendp && *p!='\n') ++p;
1306 continue;
1307 }
1308 head = p;
1309 while (p< fendp && *p!=' ' && *p!='\t' && *p!='\n' && *p!='\r') ++p;
1310 if (p > head) {
1311 string acc(head, p);
1312 string str_id = NStr::TruncateSpaces(acc, NStr::eTrunc_Both);
1313 if (str_id != "") {
1314 sis.push_back(str_id);
1315 } else {
1316 cerr << "WARNING: " << acc
1317 << " is not a valid seqid string." << endl;
1318 }
1319 }
1320 }
1321 if (in_order) *in_order = false;
1322 }
1323
SeqDB_ReadMemoryMixList(const char * fbeginp,const char * fendp,vector<CSeqDBGiList::SGiOid> & gis,vector<CSeqDBGiList::STiOid> & tis,vector<CSeqDBGiList::SSiOid> & sis,bool * in_order)1324 void SeqDB_ReadMemoryMixList(const char * fbeginp,
1325 const char * fendp,
1326 vector<CSeqDBGiList::SGiOid> & gis,
1327 vector<CSeqDBGiList::STiOid> & tis,
1328 vector<CSeqDBGiList::SSiOid> & sis,
1329 bool * in_order)
1330 {
1331 Int8 file_size = fendp - fbeginp;
1332
1333 // We would prefer to do only one allocation, so assume
1334 // average seqid is 6 digits plus newline. A few extra will be
1335 // allocated, but this is preferable to letting the vector
1336 // double itself (which it still will do if needed).
1337
1338 sis.reserve(sis.size() + int(file_size / 7));
1339
1340 const char * p = fbeginp;
1341 const char * head;
1342 while ( p < fendp) {
1343 // find the head of the seqid
1344 while (p< fendp && (*p=='>' || *p==' ' || *p=='\t' || *p=='\n' || *p=='\r')) ++p;
1345 if (p< fendp && *p == '#') {
1346 // anything beyond this point in the line is a comment
1347 while (p< fendp && *p!='\n') ++p;
1348 continue;
1349 }
1350 head = p;
1351 while (p< fendp && *p!=' ' && *p!='\t' && *p!='\n' && *p!='\r') ++p;
1352 if (p > head) {
1353 string acc(head, p);
1354 string str_id;
1355 Int8 num_id;
1356 bool simpler;
1357 ESeqDBIdType id_type = SeqDB_SimplifyAccession(acc, num_id, str_id, simpler);
1358 if (eStringId == id_type) {
1359 sis.push_back(NStr::ToLower(str_id));
1360 }
1361 else if (eTiId == id_type) {
1362 tis.push_back((TTi) num_id);
1363 }
1364 else if (eGiId == id_type) {
1365 gis.push_back(GI_FROM(Int8, num_id));
1366 }
1367 else {
1368 cerr << "WARNING: " << acc
1369 << " is not a valid seqid string." << endl;
1370 }
1371 }
1372 }
1373 if (in_order) *in_order = false;
1374 }
1375
s_ContainsBinaryNumericIdList(const string & fname,CSeqDBFileGiList::EIdType type)1376 static bool s_ContainsBinaryNumericIdList(const string& fname, CSeqDBFileGiList::EIdType type)
1377 {
1378 CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1379
1380 Int8 file_size = mfile.GetSize();
1381 const char * fbeginp = (char*) mfile.GetPtr();
1382 const char * fendp = fbeginp + (int)file_size;
1383
1384 bool ignore = false;
1385 bool has_tis = false;
1386 bool retval = s_SeqDB_IsBinaryNumericList(fbeginp, fendp, ignore, &has_tis);
1387 if (type == CSeqDBFileGiList::eGiList) {
1388 return retval;
1389 } else {
1390 retval = has_tis && retval;
1391 }
1392 return retval;
1393 }
1394
SeqDB_IsBinaryTiList(const string & fname)1395 bool SeqDB_IsBinaryTiList(const string & fname)
1396 {
1397 return s_ContainsBinaryNumericIdList(fname, CSeqDBFileGiList::eTiList);
1398 }
1399
SeqDB_IsBinaryGiList(const string & fname)1400 bool SeqDB_IsBinaryGiList(const string & fname)
1401 {
1402 return s_ContainsBinaryNumericIdList(fname, CSeqDBFileGiList::eGiList);
1403 }
1404
SeqDB_ReadGiList(const string & fname,vector<CSeqDBGiList::SGiOid> & gis,bool * in_order)1405 void SeqDB_ReadGiList(const string & fname, vector<CSeqDBGiList::SGiOid> & gis, bool * in_order)
1406 {
1407 CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1408
1409 Int8 file_size = mfile.GetSize();
1410 const char * fbeginp = (char*) mfile.GetPtr();
1411 const char * fendp = fbeginp + file_size;
1412
1413 SeqDB_ReadMemoryGiList(fbeginp, fendp, gis, in_order);
1414 }
1415
1416
SeqDB_ReadTiList(const string & fname,vector<CSeqDBGiList::STiOid> & tis,bool * in_order)1417 void SeqDB_ReadTiList(const string & fname, vector<CSeqDBGiList::STiOid> & tis, bool * in_order)
1418 {
1419 CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1420
1421 Int8 file_size = mfile.GetSize();
1422 const char * fbeginp = (char*) mfile.GetPtr();
1423 const char * fendp = fbeginp + file_size;
1424
1425 SeqDB_ReadMemoryTiList(fbeginp, fendp, tis, in_order);
1426 }
1427
SeqDB_ReadMixList(const string & fname,vector<CSeqDBGiList::SGiOid> & gis,vector<CSeqDBGiList::STiOid> & tis,vector<CSeqDBGiList::SSiOid> & sis,bool * in_order)1428 void SeqDB_ReadMixList(const string & fname, vector<CSeqDBGiList::SGiOid> & gis,
1429 vector<CSeqDBGiList::STiOid> & tis, vector<CSeqDBGiList::SSiOid> & sis, bool * in_order)
1430 {
1431 CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1432
1433 Int8 file_size = mfile.GetSize();
1434 const char *fbeginp = (char*) mfile.GetPtr();
1435 const char *fendp = fbeginp + file_size;
1436
1437 SeqDB_ReadMemoryMixList(fbeginp, fendp, gis, tis, sis, in_order);
1438 }
1439
SeqDB_ReadPigList(const string & fname,vector<CSeqDBGiList::SPigOid> & pigs,bool * in_order)1440 void SeqDB_ReadPigList(const string & fname, vector<CSeqDBGiList::SPigOid> & pigs, bool * in_order)
1441 {
1442 CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1443
1444 Int8 file_size = mfile.GetSize();
1445 const char * fbeginp = (char*) mfile.GetPtr();
1446 const char * fendp = fbeginp + file_size;
1447
1448 SeqDB_ReadMemoryPigList(fbeginp, fendp, pigs, in_order);
1449 }
1450
SeqDB_ReadTaxIdList(const string & fname,CSeqDBGiList::STaxIdsOids & taxids)1451 void SeqDB_ReadTaxIdList(const string & fname, CSeqDBGiList::STaxIdsOids & taxids)
1452 {
1453 CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1454
1455 Int8 file_size = mfile.GetSize();
1456 const char * fbeginp = (char*) mfile.GetPtr();
1457 const char * fendp = fbeginp + file_size;
1458
1459 SeqDB_ReadMemoryTaxIdList(fbeginp, fendp, taxids);
1460 }
1461
SeqDB_ReadGiList(const string & fname,vector<TGi> & gis,bool * in_order)1462 void SeqDB_ReadGiList(const string & fname, vector<TGi> & gis, bool * in_order)
1463 {
1464 typedef vector<CSeqDBGiList::SGiOid> TPairList;
1465
1466 TPairList pairs;
1467 SeqDB_ReadGiList(fname, pairs, in_order);
1468
1469 gis.reserve(pairs.size());
1470
1471 ITERATE(TPairList, iter, pairs) {
1472 gis.push_back(iter->gi);
1473 }
1474 }
1475
SeqDB_ReadSiList(const string & fname,vector<CSeqDBGiList::SSiOid> & sis,bool * in_order,SBlastSeqIdListInfo & db_info)1476 void SeqDB_ReadSiList(const string & fname, vector<CSeqDBGiList::SSiOid> & sis, bool * in_order, SBlastSeqIdListInfo & db_info)
1477 {
1478 CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1479 if (CBlastSeqidlistFile::GetSeqidlist(mfile, sis, db_info)) {
1480 *in_order = true;
1481 return;
1482 }
1483 else {
1484 Int8 file_size = mfile.GetSize();
1485 const char *fbeginp = (char*) mfile.GetPtr();
1486 const char *fendp = fbeginp + file_size;
1487 SeqDB_ReadMemorySiList(fbeginp, fendp, sis, in_order);
1488 }
1489 }
1490
FindGi(TGi gi)1491 bool CSeqDBNegativeList::FindGi(TGi gi)
1492 {
1493 InsureOrder();
1494 int b(0), e((int)m_Gis.size());
1495
1496 while(b < e) {
1497 int m = (b + e)/2;
1498 TGi m_gi = m_Gis[m];
1499
1500 if (m_gi < gi) {
1501 b = m + 1;
1502 } else if (m_gi > gi) {
1503 e = m;
1504 } else {
1505 return true;
1506 }
1507 }
1508
1509 return false;
1510 }
1511
1512
FindTi(TTi ti)1513 bool CSeqDBNegativeList::FindTi(TTi ti)
1514 {
1515 InsureOrder();
1516
1517 int b(0), e((int)m_Tis.size());
1518
1519 while(b < e) {
1520 int m = (b + e)/2;
1521 TTi m_ti = m_Tis[m];
1522
1523 if (m_ti < ti) {
1524 b = m + 1;
1525 } else if (m_ti > ti) {
1526 e = m;
1527 } else {
1528 return true;
1529 }
1530 }
1531
1532 return false;
1533 }
1534
FindId(const CSeq_id & id)1535 bool CSeqDBNegativeList::FindId(const CSeq_id & id)
1536 {
1537 bool match_type = false;
1538 return FindId(id, match_type);
1539 }
1540
1541
FindSi(string si)1542 bool CSeqDBNegativeList::FindSi(string si)
1543 {
1544 InsureOrder();
1545 int b(0), e((int)m_Sis.size());
1546
1547 while(b < e) {
1548 int m = (b + e)/2;
1549 string m_si = m_Sis[m];
1550
1551 if (m_si < si) {
1552 b = m + 1;
1553 } else if (m_si > si) {
1554 e = m;
1555 } else {
1556 return true;
1557 }
1558 }
1559
1560 return false;
1561 }
1562
1563
FindId(const CSeq_id & id,bool & match_type)1564 bool CSeqDBNegativeList::FindId(const CSeq_id & id, bool & match_type)
1565 {
1566 if (id.IsGi()) {
1567 match_type = (GetNumGis() > 0) ? true : false;
1568 if(match_type) {
1569 return(FindGi(id.GetGi()));
1570 }
1571 } else if (id.IsGeneral() && id.GetGeneral().GetDb() == "ti") {
1572 match_type = (GetNumTis() > 0) ? true : false;
1573
1574 if(match_type) {
1575 const CObject_id & obj = id.GetGeneral().GetTag();
1576
1577 Int8 ti = (obj.IsId()
1578 ? obj.GetId()
1579 : NStr::StringToInt8(obj.GetStr()));
1580
1581 return FindTi(ti);
1582 }
1583 } else {
1584 match_type = (GetNumSis() > 0) ? true : false;
1585
1586 if(match_type) {
1587 if(FindSi(GetBlastSeqIdString(id, true))) return true;
1588 if(FindSi(GetBlastSeqIdString(id, false))) return true;
1589
1590 // For isam lookup
1591 Int8 num_id;
1592 string str_id;
1593 bool simpler;
1594
1595 SeqDB_SimplifySeqid(*(const_cast<CSeq_id *>(&id)), 0, num_id, str_id, simpler);
1596
1597 if (FindSi(str_id)) {
1598 return true;
1599 }
1600
1601 // We may have to strip the version to find it...
1602 size_t pos = str_id.find(".");
1603 if (pos != str_id.npos) {
1604 string nover(str_id, 0, pos);
1605 return FindSi(nover);
1606 }
1607 }
1608 }
1609 return false;
1610 }
1611
PreprocessIdsForISAMSiLookup()1612 void CSeqDBNegativeList::PreprocessIdsForISAMSiLookup()
1613 {
1614 NON_CONST_ITERATE(vector<string>, itr, m_Sis) {
1615 string str_id = SeqDB_SimplifyAccession(*itr);
1616 *itr = NStr::ToLower(str_id);
1617 }
1618 }
1619
InsureOrder()1620 void CSeqDBNegativeList::InsureOrder()
1621 {
1622 static CFastMutex mtx;
1623 CFastMutexGuard mtx_gurad(mtx);
1624 if (m_LastSortSize != (m_Gis.size() + m_Tis.size() +m_Sis.size())) {
1625 std::sort(m_Gis.begin(), m_Gis.end());
1626 std::sort(m_Tis.begin(), m_Tis.end());
1627 std::sort(m_Sis.begin(), m_Sis.end());
1628
1629 m_LastSortSize = m_Gis.size() + m_Tis.size() + m_Sis.size();
1630 }
1631 }
1632
FindId(const CSeq_id & id)1633 bool CSeqDBGiList::FindId(const CSeq_id & id)
1634 {
1635 if (id.IsGi()) {
1636 return FindGi(id.GetGi());
1637 } else if (id.IsGeneral() && id.GetGeneral().GetDb() == "ti") {
1638 const CObject_id & obj = id.GetGeneral().GetTag();
1639
1640 TTi ti = (obj.IsId()
1641 ? (TTi) obj.GetId()
1642 : (TTi) NStr::StringToInt8(obj.GetStr()));
1643
1644 return FindTi(ti);
1645 } else {
1646 if(FindSi(GetBlastSeqIdString(id, true))) return true;
1647 if(FindSi(GetBlastSeqIdString(id, false))) return true;
1648
1649 /// For isam lookup
1650 Int8 num_id;
1651 string str_id;
1652 bool simpler;
1653 SeqDB_SimplifySeqid(*(const_cast<CSeq_id *>(&id)), 0, num_id, str_id, simpler);
1654 if (FindSi(str_id)) return true;
1655
1656 // We may have to strip the version to find it...
1657 size_t pos = str_id.find(".");
1658 if (pos != str_id.npos) {
1659 string nover(str_id, 0, pos);
1660 return FindSi(nover);
1661 }
1662 }
1663 return false;
1664 }
1665
1666
CSeqDBFileGiList(const string & fname,EIdType idtype)1667 CSeqDBFileGiList::CSeqDBFileGiList(const string & fname, EIdType idtype)
1668 {
1669 bool in_order = false;
1670 switch(idtype) {
1671 case eGiList:
1672 SeqDB_ReadGiList(fname, m_GisOids, & in_order);
1673 break;
1674 case eTiList:
1675 SeqDB_ReadTiList(fname, m_TisOids, & in_order);
1676 break;
1677 case eSiList:
1678 SeqDB_ReadSiList(fname, m_SisOids, & in_order, m_ListInfo);
1679 break;
1680 case eMixList:
1681 SeqDB_ReadMixList(fname, m_GisOids, m_TisOids, m_SisOids, & in_order);
1682 break;
1683 case ePigList:
1684 SeqDB_ReadPigList(fname, m_PigsOids, & in_order);
1685 break;
1686 case eTaxIdList:
1687 SeqDB_ReadTaxIdList(fname, m_TaxIdsOids);
1688 in_order = true;
1689 break;
1690 }
1691 m_CurrentOrder = in_order ? eGi : eNone;
1692 }
1693 /*
1694 CSeqDBFileGiList::CSeqDBFileGiList(vector<string> fnames, EIdType idtype)
1695 {
1696 bool in_order = false;
1697 switch(idtype) {
1698 case eGiList:
1699 case eTiList:
1700 NCBI_THROW(CSeqDBException,
1701 eArgErr,
1702 "Only multiple seqid list is supported.");
1703 case eSiList:
1704 ITERATE(vector<string>, iter, fnames) {
1705 SeqDB_ReadSiList(*iter, m_SisOids, & in_order);
1706 }
1707 break;
1708 case eMixList:
1709 ITERATE(vector<string>, iter, fnames) {
1710 SeqDB_ReadMixList(*iter, m_GisOids, m_TisOids, m_SisOids, & in_order);
1711 }
1712 break;
1713 }
1714 m_CurrentOrder = in_order ? eGi : eNone;
1715 }
1716 */
SeqDB_CombineAndQuote(const vector<string> & dbs,string & dbname)1717 void SeqDB_CombineAndQuote(const vector<string> & dbs,
1718 string & dbname)
1719 {
1720 int sz = 0;
1721
1722 for(unsigned i = 0; i < dbs.size(); i++) {
1723 sz += int(3 + dbs[i].size());
1724 }
1725
1726 dbname.reserve(sz);
1727
1728 for(unsigned i = 0; i < dbs.size(); i++) {
1729 if (dbname.size()) {
1730 dbname.append(" ");
1731 }
1732
1733 if (dbs[i].find(" ") != string::npos) {
1734 dbname.append("\"");
1735 dbname.append(dbs[i]);
1736 dbname.append("\"");
1737 } else {
1738 dbname.append(dbs[i]);
1739 }
1740 }
1741 }
1742
1743
SeqDB_SplitQuoted(const string & dbname,vector<CTempString> & dbs,bool keep_quote)1744 void SeqDB_SplitQuoted(const string & dbname,
1745 vector<CTempString> & dbs,
1746 bool keep_quote)
1747 {
1748 vector<CSeqDB_Substring> subs;
1749
1750 SeqDB_SplitQuoted(dbname, subs, keep_quote);
1751
1752 dbs.resize(0);
1753 dbs.reserve(subs.size());
1754
1755 ITERATE(vector<CSeqDB_Substring>, iter, subs) {
1756 CTempString tmp(iter->GetBegin(), iter->Size());
1757 dbs.push_back(tmp);
1758 }
1759 }
1760
1761
SeqDB_SplitQuoted(const string & dbname,vector<CSeqDB_Substring> & dbs,bool keep_quote)1762 void SeqDB_SplitQuoted(const string & dbname,
1763 vector<CSeqDB_Substring> & dbs,
1764 bool keep_quote)
1765 {
1766 // split names
1767
1768 const char * sp = dbname.data();
1769
1770 bool quoted = false;
1771 unsigned begin = 0;
1772
1773 for(unsigned i = 0; i < dbname.size(); i++) {
1774 char ch = dbname[i];
1775
1776 if (quoted) {
1777 // Quoted mode sees '"' as the only actionable token.
1778 if (ch == '"') {
1779 if (begin < i) {
1780 if(keep_quote) i++;
1781 dbs.push_back(CSeqDB_Substring(sp + begin, sp + i));
1782 }
1783 begin = i + 1;
1784 quoted = false;
1785 }
1786 } else {
1787 // Non-quote mode: Space or quote starts the next string.
1788
1789 if (ch == ' ') {
1790 if (begin < i) {
1791 dbs.push_back(CSeqDB_Substring(sp + begin, sp + i));
1792 }
1793 begin = i + 1;
1794 } else if (ch == '"') {
1795 if (begin < i) {
1796 dbs.push_back(CSeqDB_Substring(sp + begin, sp + i));
1797 }
1798 begin = keep_quote ? i : i + 1;
1799 quoted = true;
1800 }
1801 }
1802 }
1803
1804 if (begin < dbname.size()) {
1805 dbs.push_back(CSeqDB_Substring(sp + begin, sp + dbname.size()));
1806 }
1807 }
1808
1809
CIntersectionGiList(CSeqDBGiList & gilist,vector<TGi> & gis)1810 CIntersectionGiList::CIntersectionGiList(CSeqDBGiList & gilist, vector<TGi> & gis)
1811 {
1812 _ASSERT(this != & gilist);
1813
1814 gilist.InsureOrder(CSeqDBGiList::eGi);
1815 sort(gis.begin(), gis.end());
1816
1817 int list_i = 0;
1818 int list_n = gilist.GetNumGis();
1819 int gis_i = 0;
1820 int gis_n = (int) gis.size();
1821
1822 while(list_i < list_n && gis_i < gis_n) {
1823 TGi L = gilist.GetGiOid(list_i).gi;
1824 TGi G = gis[gis_i];
1825
1826 if (L < G) {
1827 list_i ++;
1828 continue;
1829 }
1830
1831 if (L > G) {
1832 gis_i ++;
1833 continue;
1834 }
1835
1836 m_GisOids.push_back(gilist.GetGiOid(list_i));
1837
1838 list_i++;
1839 gis_i++;
1840 }
1841
1842 m_CurrentOrder = m_GisOids.size() ? eGi : eNone;
1843 }
1844
1845
CIntersectionGiList(CSeqDBNegativeList & neg_gilist,vector<TGi> & gis)1846 CIntersectionGiList::CIntersectionGiList(CSeqDBNegativeList & neg_gilist, vector<TGi> & gis)
1847 {
1848 neg_gilist.InsureOrder();
1849 sort(gis.begin(), gis.end());
1850
1851 int list_i = 0;
1852 int list_n = neg_gilist.GetNumGis();
1853 int gis_i = 0;
1854 int gis_n = (int) gis.size();
1855
1856 while(list_i < list_n && gis_i < gis_n) {
1857 TGi L = neg_gilist.GetGi(list_i);
1858 TGi G = gis[gis_i];
1859
1860 if (L < G) {
1861 list_i ++;
1862 continue;
1863 }
1864
1865 if (L > G) {
1866 m_GisOids.push_back(gis[gis_i]);
1867 gis_i ++;
1868 continue;
1869 }
1870
1871 list_i++;
1872
1873 TGi last_gi = gis[gis_i];
1874 do { gis_i++; } while (gis_i < gis_n && gis[gis_i] == last_gi);
1875 }
1876
1877 // push all the remaining vector gi's if any left
1878 while (gis_i < gis_n) {
1879 m_GisOids.push_back(gis[gis_i++]);
1880 }
1881
1882 m_CurrentOrder = m_GisOids.size() ? eGi : eNone;
1883 }
1884
1885
CSeqDBIdSet(const vector<Int4> & ids,EIdType t,bool positive)1886 CSeqDBIdSet::CSeqDBIdSet(const vector<Int4> & ids, EIdType t, bool positive)
1887 : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1888 {
1889 x_SortAndUnique(m_Ids->Set());
1890 }
1891
CSeqDBIdSet(const vector<Int8> & ids,EIdType t,bool positive)1892 CSeqDBIdSet::CSeqDBIdSet(const vector<Int8> & ids, EIdType t, bool positive)
1893 : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1894 {
1895 x_SortAndUnique(m_Ids->Set());
1896 }
1897
CSeqDBIdSet(const vector<Uint8> & ids,EIdType t,bool positive)1898 CSeqDBIdSet::CSeqDBIdSet(const vector<Uint8> & ids, EIdType t, bool positive)
1899 : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1900 {
1901 x_SortAndUnique(m_Ids->Set());
1902 }
1903
1904 #ifdef NCBI_STRICT_GI
CSeqDBIdSet(const vector<TGi> & ids,EIdType t,bool positive)1905 CSeqDBIdSet::CSeqDBIdSet(const vector<TGi> & ids, EIdType t, bool positive)
1906 : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1907 {
1908 x_SortAndUnique(m_Ids->Set());
1909 }
1910 #endif
1911
CSeqDBIdSet(const vector<string> & ids,EIdType t,bool positive)1912 CSeqDBIdSet::CSeqDBIdSet(const vector<string> & ids, EIdType t, bool positive)
1913 : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1914 {
1915 x_SortAndUnique(m_Ids->SetSeqIDs());
1916 }
1917
x_SortAndUnique(vector<Int8> & ids)1918 void CSeqDBIdSet::x_SortAndUnique(vector<Int8> & ids)
1919 {
1920 sort(ids.begin(), ids.end());
1921 ids.erase(unique(ids.begin(), ids.end()), ids.end());
1922 }
1923
1924
x_SortAndUnique(vector<string> & ids)1925 void CSeqDBIdSet::x_SortAndUnique(vector<string> & ids)
1926 {
1927 sort(ids.begin(), ids.end());
1928 ids.erase(unique(ids.begin(), ids.end()), ids.end());
1929 }
1930
Negate()1931 void CSeqDBIdSet::Negate()
1932 {
1933 m_Positive = ! m_Positive;
1934 }
1935
1936 void CSeqDBIdSet::
x_SummarizeBooleanOp(EOperation op,bool A_pos,bool B_pos,bool & result_pos,bool & incl_A,bool & incl_B,bool & incl_AB)1937 x_SummarizeBooleanOp(EOperation op,
1938 bool A_pos,
1939 bool B_pos,
1940 bool & result_pos,
1941 bool & incl_A,
1942 bool & incl_B,
1943 bool & incl_AB)
1944 {
1945 typedef CSeqDBIdSet TIdList;
1946
1947 incl_A = incl_B = incl_AB = false;
1948
1949 // Each binary boolean function can be represented as a 4 bit
1950 // descriptor. The four bits indicate whether the result is true
1951 // when it appears, respectively, in neither list, only the second
1952 // list, only the first list, or in both lists. For example, the
1953 // operation (A AND B) can be represented as (0001), and (A OR !B)
1954 // can be written as (1011). In a positive ID list, 1 means that
1955 // an ID should be included in database iteration.
1956
1957 // But 4-bit descriptors starting with a '1' correspond to logical
1958 // operations that include all IDs not appearing in either input
1959 // set. But of course we do not have access to the IDs that do
1960 // not appear, so we cannot (feasibly) compute such operations.
1961
1962 // To solve this problem, De Morgan's Laws are used to transform
1963 // the operation into its inverse, the results of which can be
1964 // applied to SeqDB as a negative ID list.
1965
1966 // For our purposes, these three transforms are needed:
1967 //
1968 // 1. (!X and !Y) becomes !(X or Y)
1969 // 2. (!X or Y) becomes !(X and !Y)
1970 // 3. (X or !Y) becomes !(!X and Y)
1971
1972 result_pos = true;
1973
1974 switch(op) {
1975 case eAnd:
1976 if ((! A_pos) && (! B_pos)) {
1977 op = TIdList::eOr;
1978 result_pos = false;
1979 A_pos = B_pos = true;
1980 }
1981 break;
1982
1983 case eOr:
1984 if ((! A_pos) || (! B_pos)) {
1985 op = TIdList::eAnd;
1986 result_pos = false;
1987 A_pos = ! A_pos;
1988 B_pos = ! B_pos;
1989 }
1990 break;
1991
1992 case eXor:
1993 result_pos = A_pos == B_pos;
1994 break;
1995
1996 default:
1997 break;
1998 }
1999
2000 // Once we have a legal operation, we construct these flags to
2001 // summarize the boolean operation. (Each of these corresponds to
2002 // one of the bits in the 4-bit descriptor.)
2003
2004 switch(op) {
2005 case eAnd:
2006 _ASSERT(A_pos || B_pos);
2007 incl_A = !B_pos;
2008 incl_B = !A_pos;
2009 incl_AB = A_pos && B_pos;
2010 break;
2011
2012 case eOr:
2013 _ASSERT(A_pos || B_pos);
2014 incl_A = incl_B = incl_AB = true;
2015 break;
2016
2017 case eXor:
2018 incl_AB = (A_pos != B_pos);
2019 incl_A = incl_B = ! incl_AB;
2020 break;
2021
2022 default:
2023 break;
2024 }
2025 }
2026
2027 void CSeqDBIdSet::
x_BooleanSetOperation(EOperation op,const vector<Int8> & A,bool A_pos,const vector<Int8> & B,bool B_pos,vector<Int8> & result,bool & result_pos)2028 x_BooleanSetOperation(EOperation op,
2029 const vector<Int8> & A,
2030 bool A_pos,
2031 const vector<Int8> & B,
2032 bool B_pos,
2033 vector<Int8> & result,
2034 bool & result_pos)
2035 {
2036 bool incl_A(false),
2037 incl_B(false),
2038 incl_AB(false);
2039
2040 x_SummarizeBooleanOp(op,
2041 A_pos,
2042 B_pos,
2043 result_pos,
2044 incl_A,
2045 incl_B,
2046 incl_AB);
2047
2048 size_t A_i(0), B_i(0);
2049
2050 while((A_i < A.size()) && (B_i < B.size())) {
2051 Int8 Ax(A[A_i]), Bx(B[B_i]), target(-1);
2052 bool included(false);
2053
2054 if (Ax < Bx) {
2055 ++ A_i;
2056 target = Ax;
2057 included = incl_A;
2058 } else if (Ax > Bx) {
2059 ++ B_i;
2060 target = Bx;
2061 included = incl_B;
2062 } else {
2063 ++ A_i;
2064 ++ B_i;
2065 target = Ax;
2066 included = incl_AB;
2067 }
2068
2069 if (included) {
2070 result.push_back(target);
2071 }
2072 }
2073
2074 if (incl_A) {
2075 while(A_i < A.size()) {
2076 result.push_back(A[A_i++]);
2077 }
2078 }
2079
2080 if (incl_B) {
2081 while(B_i < B.size()) {
2082 result.push_back(B[B_i++]);
2083 }
2084 }
2085 }
2086
Compute(EOperation op,const vector<Int4> & ids,bool positive)2087 void CSeqDBIdSet::Compute(EOperation op,
2088 const vector<Int4> & ids,
2089 bool positive)
2090 {
2091 CRef<CSeqDBIdSet_Vector> result(new CSeqDBIdSet_Vector);
2092
2093 CRef<CSeqDBIdSet_Vector> B(new CSeqDBIdSet_Vector(ids));
2094
2095 x_SortAndUnique(B->Set());
2096
2097 bool result_pos(true);
2098
2099 x_BooleanSetOperation(op,
2100 m_Ids->Set(),
2101 m_Positive,
2102 B->Set(),
2103 positive,
2104 result->Set(),
2105 result_pos);
2106
2107 m_Positive = result_pos;
2108 m_Ids = result;
2109 }
2110
Compute(EOperation op,const vector<Int8> & ids,bool positive)2111 void CSeqDBIdSet::Compute(EOperation op,
2112 const vector<Int8> & ids,
2113 bool positive)
2114 {
2115 CRef<CSeqDBIdSet_Vector> result(new CSeqDBIdSet_Vector);
2116
2117 CRef<CSeqDBIdSet_Vector> B(new CSeqDBIdSet_Vector(ids));
2118 x_SortAndUnique(B->Set());
2119
2120 bool result_pos(true);
2121
2122 x_BooleanSetOperation(op,
2123 m_Ids->Set(),
2124 m_Positive,
2125 B->Set(),
2126 positive,
2127 result->Set(),
2128 result_pos);
2129
2130 m_Positive = result_pos;
2131 m_Ids = result;
2132 }
2133
Compute(EOperation op,const vector<Uint8> & ids,bool positive)2134 void CSeqDBIdSet::Compute(EOperation op,
2135 const vector<Uint8> & ids,
2136 bool positive)
2137 {
2138 CRef<CSeqDBIdSet_Vector> result(new CSeqDBIdSet_Vector);
2139
2140 CRef<CSeqDBIdSet_Vector> B(new CSeqDBIdSet_Vector(ids));
2141 x_SortAndUnique(B->Set());
2142
2143 bool result_pos(true);
2144
2145 x_BooleanSetOperation(op,
2146 m_Ids->Set(),
2147 m_Positive,
2148 B->Set(),
2149 positive,
2150 result->Set(),
2151 result_pos);
2152
2153 m_Positive = result_pos;
2154 m_Ids = result;
2155 }
2156
Compute(EOperation op,const CSeqDBIdSet & ids)2157 void CSeqDBIdSet::Compute(EOperation op, const CSeqDBIdSet & ids)
2158 {
2159 if (m_IdType != ids.m_IdType ) {
2160 NCBI_THROW(CSeqDBException,
2161 eArgErr,
2162 "Set operation requested but ID types don't match.");
2163 }
2164
2165 CRef<CSeqDBIdSet_Vector> result(new CSeqDBIdSet_Vector);
2166 bool result_pos(true);
2167
2168 x_BooleanSetOperation(op,
2169 m_Ids->Set(),
2170 m_Positive,
2171 ids.m_Ids->Get(),
2172 ids.m_Positive,
2173 result->Set(),
2174 result_pos);
2175
2176 m_Positive = result_pos;
2177 m_Ids = result;
2178 }
2179
GetPositiveList()2180 CRef<CSeqDBGiList> CSeqDBIdSet::GetPositiveList()
2181 {
2182 CRef<CSeqDBGiList> ids(new CSeqDBGiList);
2183
2184 if (! m_Positive) {
2185 NCBI_THROW(CSeqDBException,
2186 eFileErr,
2187 "Positive ID list requested but only negative exists.");
2188 }
2189
2190 if (m_IdType == eTi) {
2191 ids->ReserveTis(m_Ids->Size());
2192
2193 ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2194 ids->AddTi(*iter);
2195 }
2196 } else {
2197 ids->ReserveGis(m_Ids->Size());
2198
2199 ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2200 _ASSERT(((*iter) >> 32) == 0);
2201 ids->AddGi(GI_FROM(Int8, *iter));
2202 }
2203 }
2204
2205 return ids;
2206 }
2207
GetNegativeList()2208 CRef<CSeqDBNegativeList> CSeqDBIdSet::GetNegativeList()
2209 {
2210 if (m_Positive) {
2211 NCBI_THROW(CSeqDBException,
2212 eFileErr,
2213 "Negative ID list requested but only positive exists.");
2214 }
2215
2216 CRef<CSeqDBNegativeList> ids(new CSeqDBNegativeList);
2217
2218 if (m_IdType == eTi) {
2219 ids->ReserveTis(m_Ids->Size());
2220
2221 ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2222 ids->AddTi(*iter);
2223 }
2224 } else if (m_IdType == eGi) {
2225 ids->ReserveGis(m_Ids->Size());
2226
2227 ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2228 _ASSERT(((*iter) >> 32) == 0);
2229 ids->AddGi(GI_FROM(Int8, *iter));
2230 }
2231 }
2232 else {
2233 ids->ReserveSis(m_Ids->Size());
2234
2235 ITERATE(vector<string>, iter, m_Ids->SetSeqIDs()) {
2236 ids->AddSi(*iter);
2237 }
2238 }
2239
2240 return ids;
2241 }
2242
CSeqDBIdSet()2243 CSeqDBIdSet::CSeqDBIdSet()
2244 : m_Positive (false),
2245 m_IdType (eGi),
2246 m_Ids (new CSeqDBIdSet_Vector)
2247 {
2248 }
2249
Blank() const2250 bool CSeqDBIdSet::Blank() const
2251 {
2252 return (! m_Positive) && (0 == m_Ids->Size());
2253 }
2254
SeqDB_FileIntegrityAssert(const string & file,int line,const string & text)2255 void SeqDB_FileIntegrityAssert(const string & file,
2256 int line,
2257 const string & text)
2258 {
2259 string msg = "Validation failed: [" + text + "] at ";
2260 msg += file + ":" + NStr::IntToString(line);
2261 SeqDB_ThrowException(CSeqDBException::eFileErr, msg);
2262 }
2263
SeqDB_SimplifySeqid(CSeq_id & bestid,const string * acc,Int8 & num_id,string & str_id,bool & simpler)2264 ESeqDBIdType SeqDB_SimplifySeqid(CSeq_id & bestid,
2265 const string * acc,
2266 Int8 & num_id,
2267 string & str_id,
2268 bool & simpler)
2269 {
2270 ESeqDBIdType result = eStringId;
2271
2272 const CTextseq_id * tsip = 0;
2273
2274 bool matched = true;
2275
2276 switch(bestid.Which()) {
2277 case CSeq_id::e_Gi:
2278 simpler = true;
2279 num_id = GI_TO(Int8, bestid.GetGi());
2280 result = eGiId;
2281 break;
2282
2283 case CSeq_id::e_Gibbsq: /* gibbseq */
2284 simpler = true;
2285 result = eStringId;
2286 str_id = NStr::UIntToString(bestid.GetGibbsq());
2287 break;
2288
2289 case CSeq_id::e_General:
2290 {
2291 const CDbtag & dbt = bestid.GetGeneral();
2292
2293 if (dbt.CanGetDb()) {
2294 if (dbt.GetDb() == "BL_ORD_ID") {
2295 simpler = true;
2296 num_id = dbt.GetTag().GetId();
2297 result = eOID;
2298 break;
2299 }
2300
2301 if (dbt.GetDb() == "PIG") {
2302 simpler = true;
2303 num_id = dbt.GetTag().GetId();
2304 result = ePigId;
2305 break;
2306 }
2307
2308 if (dbt.GetDb() == "ti") {
2309 simpler = true;
2310 num_id = (dbt.GetTag().IsStr()
2311 ? NStr::StringToInt8(dbt.GetTag().GetStr())
2312 : dbt.GetTag().GetId());
2313
2314 result = eTiId;
2315 break;
2316 }
2317
2318
2319 if (NStr::CompareNocase(dbt.GetDb(), "GNOMON") == 0) {
2320 str_id = bestid.AsFastaString();
2321 str_id = NStr::ToLower(str_id);
2322 result = eStringId;
2323 break;
2324 }
2325 }
2326
2327 if (dbt.CanGetTag() && dbt.GetTag().IsStr()) {
2328 result = eStringId;
2329 str_id = dbt.GetTag().GetStr();
2330 str_id = NStr::ToLower(str_id);
2331 } else {
2332 // Use the default logic.
2333 matched = false;
2334 }
2335 }
2336 break;
2337
2338 case CSeq_id::e_Local: /* local */
2339 simpler = true;
2340 result = eStringId;
2341 {
2342 const CObject_id & objid = bestid.GetLocal();
2343
2344 if (objid.IsStr()) {
2345 // sparse version will leave "lcl|" off.
2346 str_id = objid.GetStr();
2347 str_id = NStr::ToLower(str_id);
2348 } else {
2349 // Local numeric ids are stored as strings.
2350 str_id = "lcl|" + NStr::IntToString(objid.GetId());
2351 }
2352 }
2353 break;
2354
2355 // tsip types
2356
2357 case CSeq_id::e_Embl: /* embl */
2358 case CSeq_id::e_Ddbj: /* ddbj */
2359 case CSeq_id::e_Genbank: /* genbank */
2360 case CSeq_id::e_Tpg: /* Third Party Annot/Seq Genbank */
2361 case CSeq_id::e_Tpe: /* Third Party Annot/Seq EMBL */
2362 case CSeq_id::e_Tpd: /* Third Party Annot/Seq DDBJ */
2363 case CSeq_id::e_Other: /* other */
2364 case CSeq_id::e_Swissprot: /* swissprot (now with versions) */
2365 case CSeq_id::e_Gpipe: /* internal NCBI genome pipeline */
2366 tsip = bestid.GetTextseq_Id();
2367 break;
2368
2369 case CSeq_id::e_Pir: /* pir */
2370 case CSeq_id::e_Prf: /* prf */
2371 tsip = bestid.GetTextseq_Id();
2372 break;
2373
2374 default:
2375 matched = false;
2376 }
2377
2378 // Default: if we have a string, use it; if we only have seqid,
2379 // create a string. This should not happen if the seqid matches
2380 // one of the cases above, which currently correspond to all the
2381 // supported seqid types.
2382
2383 CSeq_id::ELabelFlags label_flags = (CSeq_id::ELabelFlags)
2384 (CSeq_id::fLabel_GeneralDbIsContent | CSeq_id::fLabel_Version);
2385
2386 if (! matched) {
2387 // (should not happen normally)
2388
2389 simpler = false;
2390 result = eStringId;
2391
2392 if (acc) {
2393 str_id = *acc;
2394 str_id = NStr::ToLower(str_id);
2395 } else {
2396 bestid.GetLabel(& str_id, CSeq_id::eFasta, label_flags);
2397 str_id = NStr::ToLower(str_id);
2398 }
2399 }
2400
2401 if (tsip) {
2402 bool found = false;
2403
2404 if (tsip->CanGetAccession()) {
2405 str_id = tsip->GetAccession();
2406 str_id = NStr::ToLower(str_id);
2407 found = true;
2408
2409 if (tsip->CanGetVersion()) {
2410 str_id += ".";
2411 str_id += NStr::UIntToString(tsip->GetVersion());
2412 }
2413 } else if (tsip->CanGetName()) {
2414 str_id = tsip->GetName();
2415 str_id = NStr::ToLower(str_id);
2416 found = true;
2417 }
2418
2419 if (found) {
2420 simpler = true;
2421 result = eStringId;
2422 }
2423 }
2424
2425 return result;
2426 }
2427
2428 /// Find the end of a single element in a Seq-id set
2429 ///
2430 /// Seq-id strings sometimes contain several Seq-ids. This function
2431 /// looks for the end of the first Seq-id, and will return its length.
2432 /// Static methods of CSeq_id are used to evaluate tokens.
2433 ///
2434 /// @param str
2435 /// Seq-id string to search.
2436 /// @param pos
2437 /// Position at which to start search.
2438 /// @return
2439 /// End position of first fasta id, or string::npos in case of error.
2440
2441 static size_t
s_SeqDB_EndOfFastaID(const string & str,size_t pos)2442 s_SeqDB_EndOfFastaID(const string & str, size_t pos)
2443 {
2444 // (Derived from s_EndOfFastaID()).
2445
2446 size_t vbar = str.find('|', pos);
2447
2448 if (vbar == string::npos) {
2449 return string::npos; // bad
2450 }
2451
2452 string portion(str, pos, vbar - pos);
2453
2454 CSeq_id::E_Choice choice =
2455 CSeq_id::WhichInverseSeqId(portion.c_str());
2456
2457 if (choice != CSeq_id::e_not_set) {
2458 size_t vbar_prev = vbar;
2459 int count;
2460 for (count=0; ; ++count, vbar_prev = vbar) {
2461 vbar = str.find('|', vbar_prev + 1);
2462
2463 if (vbar == string::npos) {
2464 break;
2465 }
2466
2467 int start_pt = int(vbar_prev + 1);
2468 string element(str, start_pt, vbar - start_pt);
2469
2470 choice = CSeq_id::WhichInverseSeqId(element.c_str());
2471
2472 if (choice != CSeq_id::e_not_set) {
2473 vbar = vbar_prev;
2474 break;
2475 }
2476 }
2477 } else {
2478 return string::npos; // bad
2479 }
2480
2481 return (vbar == string::npos) ? str.size() : vbar;
2482 }
2483
2484 /// Parse string into a sequence of Seq-id objects.
2485 ///
2486 /// A string is broken down into Seq-ids and the set of Seq-ids is
2487 /// returned.
2488 ///
2489 /// @param line
2490 /// The string to interpret.
2491 /// @param seqids
2492 /// The returned set of Seq-id objects.
2493 /// @return
2494 /// true if any Seq-id objects were found.
2495
2496 static bool
s_SeqDB_ParseSeqIDs(const string & line,vector<CRef<CSeq_id>> & seqids)2497 s_SeqDB_ParseSeqIDs(const string & line,
2498 vector< CRef< CSeq_id > > & seqids)
2499 {
2500 // (Derived from s_ParseFastaDefline()).
2501
2502 seqids.clear();
2503 size_t pos = 0;
2504
2505 while (pos < line.size()) {
2506 size_t end = s_SeqDB_EndOfFastaID(line, pos);
2507
2508 if (end == string::npos) {
2509 // We didn't get a clean parse -- ignore the data after
2510 // this point, and return what we have.
2511 break;
2512 }
2513
2514 string element(line, pos, end - pos);
2515
2516 CRef<CSeq_id> id;
2517
2518 try {
2519 id = new CSeq_id(element);
2520 }
2521 catch(invalid_argument &) {
2522 // Maybe this should be done: "seqids.clear();"
2523 break;
2524 }
2525
2526 seqids.push_back(id);
2527 pos = end + 1;
2528 }
2529
2530 return ! seqids.empty();
2531 }
2532
2533
2534
SeqDB_SimplifyAccession(const string & acc,Int8 & num_id,string & str_id,bool & simpler)2535 ESeqDBIdType SeqDB_SimplifyAccession(const string & acc,
2536 Int8 & num_id,
2537 string & str_id,
2538 bool & simpler)
2539 {
2540 ESeqDBIdType result = eStringId;
2541 num_id = (Uint4)-1;
2542
2543 vector< CRef< CSeq_id > > seqid_set;
2544
2545 if (s_SeqDB_ParseSeqIDs(acc, seqid_set)) {
2546 // Something like SeqIdFindBest()
2547 CRef<CSeq_id> bestid =
2548 FindBestChoice(seqid_set, CSeq_id::BestRank);
2549
2550 result = SeqDB_SimplifySeqid(*bestid, & acc, num_id, str_id, simpler);
2551 } else {
2552
2553 // Check for bare pdb accession with underscore (such as 12AS_A). These
2554 // are not in the isam index and need to be translated to the
2555 // standard form (pdb|12AS|A).
2556 list< CRef<CSeq_id> > seqids;
2557 try {
2558 CSeq_id::ParseFastaIds(seqids, acc, false);
2559 }
2560 catch (...) {
2561 seqids.clear();
2562 }
2563
2564 if (!seqids.empty() && seqids.front()->IsPdb() &&
2565 acc.find("_") != string::npos) {
2566
2567 str_id = seqids.front()->AsFastaString();
2568 str_id = NStr::ToLower(str_id);
2569 }
2570 else if (!seqids.empty() && seqids.front()->IsLocal()) {
2571 // Chec for gnl dbs
2572 if( acc.find(":") != string::npos) {
2573 static const char* GNL_DBs[] = {"CDD", "SRA", "TSA", "GNOMON", NULL};
2574 string db_tag, gnl_id;
2575 NStr::SplitInTwo(acc, ":", db_tag, gnl_id);
2576 const char** p = GNL_DBs;
2577 for (; p && *p; ++p) {
2578 if(NStr::EqualNocase(*p, db_tag.c_str())) {
2579 str_id = "gnl|" + db_tag + "|" + gnl_id;
2580 seqids.front().Reset();
2581 CRef<CSeq_id> new_id(new CSeq_id(str_id));
2582 seqids.front() = new_id;
2583 break;
2584 }
2585 }
2586 if(*p == NULL) {
2587 str_id = acc;
2588 }
2589 }
2590 else {
2591 Int8 n_id = 0;
2592 if (NStr::StringToNumeric<Int8>(acc,&n_id, NStr::fConvErr_NoThrow)) {
2593 str_id = "lcl|" + acc;
2594 }
2595 else {
2596 str_id = acc;
2597 }
2598 }
2599 }
2600 else {
2601 str_id = acc;
2602 }
2603 result = eStringId;
2604 simpler = false;
2605 }
2606
2607 return result;
2608 }
2609
SeqDB_SimplifyAccession(const string & acc)2610 const string SeqDB_SimplifyAccession(const string &acc)
2611 {
2612 Int8 num_id;
2613 string str_id;
2614 bool simpler(false);
2615 ESeqDBIdType result = SeqDB_SimplifyAccession(acc, num_id, str_id, simpler);
2616 if (result == eStringId) return str_id;
2617 else return "";
2618 }
2619
SeqDB_GetFileExtensions(bool db_is_protein,vector<string> & extn,EBlastDbVersion dbver)2620 void SeqDB_GetFileExtensions(bool db_is_protein, vector<string>& extn, EBlastDbVersion dbver)
2621 {
2622 // NOTE: If more extensions are added, please keep in sync with
2623 // updatedb.pl's DistributeBlastDbsToBackends
2624 // and Blast.pm's @blastdb_extensions
2625 extn.clear();
2626
2627 const string kExtnMol(1, db_is_protein ? 'p' : 'n');
2628
2629 extn.push_back(kExtnMol + "al"); // alias file
2630 extn.push_back(kExtnMol + "in"); // index file
2631 extn.push_back(kExtnMol + "hr"); // header file
2632 extn.push_back(kExtnMol + "sq"); // sequence file
2633 extn.push_back(kExtnMol + "ni"); // ISAM numeric index file
2634 extn.push_back(kExtnMol + "nd"); // ISAM numeric data file
2635 if (dbver == eBDB_Version4) {
2636 extn.push_back(kExtnMol + "si"); // ISAM string index file
2637 extn.push_back(kExtnMol + "sd"); // ISAM string data file
2638 }
2639 extn.push_back(kExtnMol + "pi"); // ISAM PIG index file
2640 extn.push_back(kExtnMol + "pd"); // ISAM PIG data file
2641 if (dbver == eBDB_Version5) {
2642 vector<string> lmdbs;
2643 SeqDB_GetLMDBFileExtensions(db_is_protein, lmdbs);
2644 extn.insert(extn.end(), lmdbs.begin(), lmdbs.end());
2645 }
2646 // Contain masking information
2647 extn.push_back(kExtnMol + "aa"); // ISAM mask index file
2648 extn.push_back(kExtnMol + "ab"); // ISAM mask data file (big-endian)
2649 extn.push_back(kExtnMol + "ac"); // ISAM mask data file (little-endian)
2650 extn.push_back(kExtnMol + "og"); // OID to GI file
2651 extn.push_back(kExtnMol + "hi"); // ISAM sequence hash index file
2652 extn.push_back(kExtnMol + "hd"); // ISAM sequence hash data file
2653 extn.push_back(kExtnMol + "ti"); // ISAM trace id index file
2654 extn.push_back(kExtnMol + "td"); // ISAM trace id data file
2655 }
2656
2657
SeqDB_GetLMDBFileExtensions(bool db_is_protein,vector<string> & extn)2658 void SeqDB_GetLMDBFileExtensions(bool db_is_protein, vector<string>& extn)
2659 {
2660
2661 static const char * ext[]={"db", "os", "ot", "tf", "to", "db-lock", "tf-lock", NULL};
2662 extn.clear();
2663 const string kExtnMol(1, db_is_protein ? 'p' : 'n');
2664 for(const char ** p=ext; *p != NULL; p++) {
2665 extn.push_back(kExtnMol + (*p));
2666 }
2667 }
2668
IsStringId(const CSeq_id & id)2669 bool IsStringId(const CSeq_id & id)
2670 {
2671 switch(id.Which()) {
2672 case CSeq_id::e_Gi:
2673 return false;
2674 break;
2675 case CSeq_id::e_General:
2676 {
2677 const CDbtag & dbt = id.GetGeneral();
2678 if (dbt.CanGetDb() && (dbt.GetDb() == "PIG")) {
2679 return false;
2680 }
2681 }
2682 default:
2683 return true;
2684 break;
2685 };
2686 }
2687
GetBlastSeqIdString(const CSeq_id & seqid,bool version)2688 string GetBlastSeqIdString(const CSeq_id & seqid, bool version)
2689 {
2690 if(seqid.IsPir() || seqid.IsPrf()) {
2691 return seqid.AsFastaString();
2692 }
2693
2694 return seqid.GetSeqIdString(version);
2695 }
2696
2697 END_NCBI_SCOPE
2698
2699