1 /*  $Id: seqdbcommon.cpp 631511 2021-05-19 13:47:54Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Kevin Bealer
27  *
28  */
29 
30 /// @file seqdbcommon.cpp
31 /// Definitions of various helper functions for SeqDB.
32 #include <ncbi_pch.hpp>
33 #include <corelib/metareg.hpp>
34 #include <corelib/ncbienv.hpp>
35 #include <corelib/ncbifile.hpp>
36 #include <objtools/blast/seqdb_reader/seqdbcommon.hpp>
37 #include <util/sequtil/sequtil.hpp>
38 #include <util/sequtil/sequtil_convert.hpp>
39 #include <objects/seq/seq__.hpp>
40 #include <objects/general/general__.hpp>
41 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
42 #include <objtools/blast/seqdb_reader/impl/seqdbatlas.hpp>
43 #include <objtools/blast/seqdb_reader/seqidlist_reader.hpp>
44 #include <algorithm>
45 
46 BEGIN_NCBI_SCOPE
47 
48 const string kSeqDBGroupAliasFileName("index.alx");
49 
SeqDB_RemoveDirName(CSeqDB_Substring s)50 CSeqDB_Substring SeqDB_RemoveDirName(CSeqDB_Substring s)
51 {
52     int off = s.FindLastOf(CFile::GetPathSeparator());
53 
54     if (off != -1) {
55         s.EraseFront(off + 1);
56     }
57 
58     return s;
59 }
60 
61 
SeqDB_RemoveFileName(CSeqDB_Substring s)62 CSeqDB_Substring SeqDB_RemoveFileName(CSeqDB_Substring s)
63 {
64     int off = s.FindLastOf(CFile::GetPathSeparator());
65 
66     if (off != -1) {
67         s.Resize(off);
68     } else {
69         s.Clear();
70     }
71 
72     return s;
73 }
74 
75 
SeqDB_RemoveExtn(CSeqDB_Substring s)76 CSeqDB_Substring SeqDB_RemoveExtn(CSeqDB_Substring s)
77 {
78     // This used to remove anything after the last "." it could find.
79     // Then it was changed to only remove the part after the ".", if
80     // it did not contain a "/" character.
81 
82     // Now it has been made even stricter, it looks for something like
83     // "(.*)([.][a-zA-Z]{3})" and removes the second sub-expression if
84     // there is a match.  This is because of mismatches like "1234.00"
85     // that are not real "file extensions" in the way that SeqDB wants
86     // to process them.
87 
88     int slen = s.Size();
89 
90     if (slen > 4) {
91         string extn(s.GetEnd()-4, s.GetEnd());
92         string extn2(extn, 2, 4);
93         // Of course, nal and pal are not the only valid
94         // extensions, but this code is only used with these two,
95         // as far as I know, at this moment in time.
96 
97         if (extn[0] == '.'     &&
98             (extn[1] == 'n' || extn[1] == 'p') &&
99             (extn2 == "al"  || extn2 == "in")) {
100             /*
101             isalpha( s[slen-3] ) &&
102             isalpha( s[slen-2] ) &&
103             isalpha( s[slen-1] )) {*/
104 
105             s.Resize(slen - 4);
106         }
107     }
108 
109     return s;
110 }
111 
112 
SeqDB_SplitString(CSeqDB_Substring & buffer,CSeqDB_Substring & front,char delim)113 bool SeqDB_SplitString(CSeqDB_Substring & buffer,
114                        CSeqDB_Substring & front,
115                        char               delim)
116 {
117     for(int i = 0; i < buffer.Size(); i++) {
118         if (buffer[i] == delim) {
119             front = buffer;
120 
121             buffer.EraseFront(i + 1);
122             front.Resize(i);
123 
124             return true;
125         }
126     }
127     return false;
128 }
129 
130 
SeqDB_CombinePath(const CSeqDB_Substring & one,const CSeqDB_Substring & two,const CSeqDB_Substring * extn,string & outp)131 void SeqDB_CombinePath(const CSeqDB_Substring & one,
132                        const CSeqDB_Substring & two,
133                        const CSeqDB_Substring * extn,
134                        string                 & outp)
135 {
136     char delim = CFile::GetPathSeparator();
137 
138     int extn_amt = extn ? (extn->Size()+1) : 0;
139 
140     if (two.Empty()) {
141         // We only use the extension if there is a filename.
142         one.GetString(outp);
143         return;
144     }
145 
146     bool only_two = false;
147 
148     if (one.Empty() || two[0] == delim) {
149         only_two = true;
150     }
151 
152     // Drive letter test for CP/M derived systems
153     if (delim == '\\'   &&
154         two.Size() > 3  &&
155         isalpha(two[0]) &&
156         two[1] == ':'   &&
157         two[2] == '\\') {
158 
159         only_two = true;
160     }
161 
162     if (only_two) {
163         outp.reserve(two.Size() + extn_amt);
164         two.GetString(outp);
165 
166         if (extn) {
167             outp.append(".");
168             outp.append(extn->GetBegin(), extn->GetEnd());
169         }
170         return;
171     }
172 
173     outp.reserve(one.Size() + two.Size() + 1 + extn_amt);
174 
175     one.GetString(outp);
176 
177     if (outp[outp.size() - 1] != delim) {
178         outp += delim;
179     }
180 
181     outp.append(two.GetBegin(), two.GetEnd());
182 
183     if (extn) {
184         outp.append(".");
185         outp.append(extn->GetBegin(), extn->GetEnd());
186     }
187 }
188 
189 
SeqDB_CompareVolume(const string & s1,const string & s2)190 bool SeqDB_CompareVolume(const string & s1, const string & s2)
191 {
192     string x1, x2;
193     CSeqDB_Path(s1).FindBaseName().GetString(x1);
194     CSeqDB_Path(s2).FindBaseName().GetString(x2);
195     if (x1 != x2) return (x1 < x2);
196     else return (s1 < s2);
197 }
198 
199 /// File existence test interface.
200 class CSeqDB_FileExistence {
201 public:
202     /// Destructor
~CSeqDB_FileExistence()203     virtual ~CSeqDB_FileExistence()
204     {
205     }
206 
207     /// Check if file exists at fully qualified path.
208     /// @param fname Filename.
209     /// @return True if the file was found.
210     virtual bool DoesFileExist(const string & fname) = 0;
211 };
212 
213 
214 /// Test whether an index or alias file exists
215 ///
216 /// The provide filename is combined with both of the extensions
217 /// appropriate to the database sequence type, and the resulting
218 /// strings are checked for existence in the file system.  The
219 /// 'access' object defines how to check file existence.
220 ///
221 /// @param dbname
222 ///   Input path and filename
223 /// @param dbtype
224 ///   Database type, either protein or nucleotide
225 /// @param access
226 ///   The file access object.
227 /// @param linkoutdb_search
228 ///   Determines whether linkoutdb files should be searched for
229 /// @return
230 ///   true if either of the index or alias files is found
231 
s_SeqDB_DBExists(const string & dbname,char dbtype,CSeqDB_FileExistence & access,bool linkoutdb_search)232 static bool s_SeqDB_DBExists(const string         & dbname,
233                              char                   dbtype,
234                              CSeqDB_FileExistence & access,
235                              bool linkoutdb_search)
236 {
237     string path;
238     path.reserve(dbname.size() + 4);
239     path.assign(dbname.data(), dbname.data() + dbname.size());
240 
241     if (linkoutdb_search) {
242         _ASSERT(dbtype == 'p');
243         path.append(".sqlite3");
244         if (access.DoesFileExist(path)) {
245             return true;
246         }
247     } else {
248         path.append(".-al");
249 
250         path[path.size()-3] = dbtype;
251 
252         if (access.DoesFileExist(path)) {
253             return true;
254         }
255 
256         path[path.size()-2] = 'i';
257         path[path.size()-1] = 'n';
258 
259         if (access.DoesFileExist(path)) {
260             return true;
261         }
262     }
263 
264     return false;
265 }
266 
267 
268 /// Returns the character used to seperate path components in the
269 /// current operating system or platform.
s_GetPathSplitter()270 static string s_GetPathSplitter()
271 {
272     const char * splitter = 0;
273 
274 #if defined(NCBI_OS_UNIX)
275     splitter = ":";
276 #else
277     splitter = ";";
278 #endif
279 
280     return splitter;
281 }
282 
283 
SeqDB_ConvertOSPath(string & dbs)284 void SeqDB_ConvertOSPath(string & dbs)
285 {
286     // See also CDirEntry::ConvertToOSPath()
287 
288     char delim = CDirEntry::GetPathSeparator();
289 
290     for(size_t i = 0; i<dbs.size(); i++) {
291         if (dbs[i] == '/' || dbs[i] == '\\') {
292             dbs[i] = delim;
293         }
294     }
295 }
296 
297 
SeqDB_MakeOSPath(const string & dbs)298 string SeqDB_MakeOSPath(const string & dbs)
299 {
300     string cvt(dbs);
301     SeqDB_ConvertOSPath(cvt);
302     return cvt;
303 }
304 
305 
306 /// Search for a file in a provided set of paths
307 ///
308 /// This function takes a search path as a ":" delimited set of path
309 /// names, and searches in those paths for the given database
310 /// component.  The component name may include path components.  If
311 /// the exact flag is set, the path is assumed to contain any required
312 /// extension; otherwise extensions for index and alias files will be
313 /// tried.  Each element of the search path is tried in sequential
314 /// order for both index or alias files (if exact is not set), before
315 /// moving to the next element of the search path.  The path returned
316 /// from this function will not contain a file extension unless the
317 /// provided filename did (in which case, exact is normally set).
318 ///
319 /// @param blast_paths
320 ///   List of filesystem paths seperated by ":".
321 /// @param dbname
322 ///   Base name of the database index or alias file to search for.
323 /// @param dbtype
324 ///   Type of database, either protein or nucleotide.
325 /// @param exact
326 ///   Set to true if dbname already contains any needed extension.
327 /// @param linkoutdb_search
328 ///   Determines whether linkoutdb files should be searched for
329 /// @return
330 ///   Full pathname, minus extension, or empty string if none found.
331 
s_SeqDB_TryPaths(const string & blast_paths,const string & dbname,char dbtype,bool exact,CSeqDB_FileExistence & access,bool linkoutdb_search=false)332 static string s_SeqDB_TryPaths(const string         & blast_paths,
333                                const string         & dbname,
334                                char                   dbtype,
335                                bool                   exact,
336                                CSeqDB_FileExistence & access,
337                                bool                   linkoutdb_search = false)
338 {
339     // 1. If this was a vector<CSeqDB_Substring>, the tokenize would
340     //    not need to do any allocations (but would need rewriting).
341     //
342     // 2. If this was split into several functions, and/or a stateful
343     //    class was used, this would perform better here, and would
344     //    allow improvement of the search routine for combined group
345     //    indices (see comments in CSeqDBAliasSets::FindAliasPath).
346 
347     vector<string> roads;
348     NStr::Split(blast_paths, s_GetPathSplitter(), roads, NStr::fSplit_Tokenize);
349 
350     string result;
351     string attempt;
352 
353     ITERATE(vector<string>, road, roads) {
354         attempt.erase();
355 
356         SeqDB_CombinePath(CSeqDB_Substring(SeqDB_MakeOSPath(*road)),
357                           CSeqDB_Substring(dbname),
358                           0,
359                           attempt);
360 
361         if (exact) {
362             if (access.DoesFileExist(attempt)) {
363                 result = attempt;
364                 break;
365             }
366         } else {
367             if (s_SeqDB_DBExists(attempt, dbtype, access, linkoutdb_search)) {
368                 result = attempt;
369                 break;
370             }
371         }
372     }
373 
374     return result;
375 }
376 
377 static string
s_SeqDB_FindBlastDBPath(const string & dbname,char dbtype,string * sp,bool exact,CSeqDB_FileExistence & access,const string path="")378 s_SeqDB_FindBlastDBPath(const string         & dbname,
379                         char                   dbtype,
380                         string               * sp,
381                         bool                   exact,
382                         CSeqDB_FileExistence & access,
383                         const string           path="")
384 {
385     const string pathology = (path=="") ? CSeqDBAtlas::GenerateSearchPath() : path;
386 
387     if (sp) {
388         *sp = pathology;
389     }
390 
391     return s_SeqDB_TryPaths(pathology, dbname, dbtype, exact, access);
392 }
393 
394 /// Check file existence using CSeqDBAtlas.
395 class CSeqDB_AtlasAccessor : public CSeqDB_FileExistence {
396 public:
397     /// Constructor.
CSeqDB_AtlasAccessor(CSeqDBAtlas & atlas)398     CSeqDB_AtlasAccessor(CSeqDBAtlas    & atlas)
399         : m_Atlas  (atlas)
400     {
401     }
402 
403     /// Test file existence.
404     /// @param fname Fully qualified name of file for which to look.
405     /// @return True iff file exists.
DoesFileExist(const string & fname)406     virtual bool DoesFileExist(const string & fname)
407     {
408         return m_Atlas.DoesFileExist(fname);
409     }
410 
411 private:
412     CSeqDBAtlas    & m_Atlas;
413 };
414 
415 
SeqDB_FindBlastDBPath(const string & dbname,char dbtype,string * sp,bool exact,CSeqDBAtlas & atlas)416 string SeqDB_FindBlastDBPath(const string   & dbname,
417                              char             dbtype,
418                              string         * sp,
419                              bool             exact,
420                              CSeqDBAtlas    & atlas)
421 {
422     CSeqDB_AtlasAccessor access(atlas);
423 
424     return s_SeqDB_FindBlastDBPath(dbname,
425                                    dbtype,
426                                    sp,
427                                    exact,
428                                    access,
429                                    atlas.GetSearchPath());
430 }
431 
432 
433 /// Check file existence using CFile.
434 class CSeqDB_SimpleAccessor : public CSeqDB_FileExistence {
435 public:
436     /// Constructor.
CSeqDB_SimpleAccessor()437     CSeqDB_SimpleAccessor()
438     {
439     }
440 
441     /// Test file existence.
442     /// @param fname Fully qualified name of file for which to look.
443     /// @return True iff file exists.
DoesFileExist(const string & fname)444     virtual bool DoesFileExist(const string & fname)
445     {
446         // Use the same criteria as the Atlas code would.
447         CFile whole(SeqDB_MakeOSPath(fname));
448         return whole.GetLength() != (Int8) -1;
449     }
450 };
451 
452 
SeqDB_ResolveDbPath(const string & filename)453 string SeqDB_ResolveDbPath(const string & filename)
454 {
455     CSeqDB_SimpleAccessor access;
456 
457     return s_SeqDB_FindBlastDBPath(filename,
458                                    '-',
459                                    0,
460                                    true,
461                                    access);
462 }
463 
SeqDB_ResolveDbPathNoExtension(const string & filename,char dbtype)464 string SeqDB_ResolveDbPathNoExtension(const string & filename,
465                                       char dbtype /* = '-' */)
466 {
467     CSeqDB_SimpleAccessor access;
468 
469     return s_SeqDB_FindBlastDBPath(filename, dbtype, 0, false, access);
470 }
471 
SeqDB_ResolveDbPathForLinkoutDB(const string & filename)472 string SeqDB_ResolveDbPathForLinkoutDB(const string & filename)
473 {
474     const char dbtype('p'); // this is determined by blastdb_links application
475     CSeqDB_SimpleAccessor access;
476     const string pathology = CSeqDBAtlas::GenerateSearchPath();
477     return s_SeqDB_TryPaths(pathology, filename, dbtype, false, access, true);
478 }
479 
SeqDB_JoinDelim(string & a,const string & b,const string & delim)480 void SeqDB_JoinDelim(string & a, const string & b, const string & delim)
481 {
482     if (b.empty()) {
483         return;
484     }
485 
486     if (a.empty()) {
487         // a has no size - but might have capacity
488         s_SeqDB_QuickAssign(a, b);
489         return;
490     }
491 
492     size_t newlen = a.length() + b.length() + delim.length();
493 
494     if (a.capacity() < newlen) {
495         size_t newcap = 16;
496 
497         while(newcap < newlen) {
498             newcap <<= 1;
499         }
500 
501         a.reserve(newcap);
502     }
503 
504     a += delim;
505     a += b;
506 }
507 
508 
CSeqDBGiList()509 CSeqDBGiList::CSeqDBGiList()
510     : m_CurrentOrder(eNone)
511 {
512 }
513 
514 
515 /// Compare SGiOid structs by OID.
516 class CSeqDB_SortOidLessThan {
517 public:
518     /// Test whether lhs is less than (occurs before) rhs.
519     /// @param lhs Left hand side of less-than operator. [in]
520     /// @param rhs Right hand side of less-than operator. [in]
521     /// @return True if lhs has a lower OID than rhs.
operator ()(const CSeqDBGiList::SGiOid & lhs,const CSeqDBGiList::SGiOid & rhs)522     int operator()(const CSeqDBGiList::SGiOid & lhs,
523                    const CSeqDBGiList::SGiOid & rhs)
524     {
525         return lhs.oid < rhs.oid;
526     }
527 };
528 
529 
530 /// Compare SGiOid structs by GI.
531 class CSeqDB_SortGiLessThan {
532 public:
533     /// Test whether lhs is less than (occurs before) rhs.
534     /// @param lhs Left hand side of less-than operator. [in]
535     /// @param rhs Right hand side of less-than operator. [in]
536     /// @return True if lhs has a lower GI than rhs.
operator ()(const CSeqDBGiList::SGiOid & lhs,const CSeqDBGiList::SGiOid & rhs)537     int operator()(const CSeqDBGiList::SGiOid & lhs,
538                    const CSeqDBGiList::SGiOid & rhs)
539     {
540         return lhs.gi < rhs.gi;
541     }
542 };
543 
544 class CSeqDB_SortPigLessThan {
545 public:
546     /// Test whether lhs is less than (occurs before) rhs.
547     /// @param lhs Left hand side of less-than operator. [in]
548     /// @param rhs Right hand side of less-than operator. [in]
549     /// @return True if lhs has a lower GI than rhs.
operator ()(const CSeqDBGiList::SPigOid & lhs,const CSeqDBGiList::SPigOid & rhs)550     int operator()(const CSeqDBGiList::SPigOid & lhs,
551                    const CSeqDBGiList::SPigOid & rhs)
552     {
553         return lhs.pig < rhs.pig;
554     }
555 };
556 
557 
558 
559 /// Compare SGiOid structs by GI.
560 class CSeqDB_SortTiLessThan {
561 public:
562     /// Test whether lhs is less than (occurs before) rhs.
563     /// @param lhs Left hand side of less-than operator. [in]
564     /// @param rhs Right hand side of less-than operator. [in]
565     /// @return True if lhs has a lower GI than rhs.
operator ()(const CSeqDBGiList::STiOid & lhs,const CSeqDBGiList::STiOid & rhs)566     int operator()(const CSeqDBGiList::STiOid & lhs,
567                    const CSeqDBGiList::STiOid & rhs)
568     {
569         return lhs.ti < rhs.ti;
570     }
571 };
572 
573 
574 /// Compare SSeqIdOid structs by SeqId.
575 class CSeqDB_SortSiLessThan {
576 public:
577     /// Test whether lhs is less than (occurs before) rhs.
578     /// @param lhs Left hand side of less-than operator. [in]
579     /// @param rhs Right hand side of less-than operator. [in]
580     /// @return True if lhs sorts before rhs by Seq-id.
operator ()(const CSeqDBGiList::SSiOid & lhs,const CSeqDBGiList::SSiOid & rhs)581     int operator()(const CSeqDBGiList::SSiOid & lhs,
582                    const CSeqDBGiList::SSiOid & rhs)
583     {
584         return lhs.si < rhs.si;
585     }
586 };
587 
588 
589 template<class TCompare, class TVector>
s_InsureOrder(TVector & v)590 void s_InsureOrder(TVector & v)
591 {
592     bool already = true;
593 
594     TCompare compare_less;
595 
596     for(int i = 1; i < (int) v.size(); i++) {
597         if (compare_less(v[i], v[i-1])) {
598             already = false;
599             break;
600         }
601     }
602 
603     if (! already) {
604         sort(v.begin(), v.end(), compare_less);
605     }
606 }
607 
PreprocessIdsForISAMSiLookup()608 void CSeqDBGiList::PreprocessIdsForISAMSiLookup()
609 {
610 	NON_CONST_ITERATE(vector<CSeqDBGiList::SSiOid>, itr, m_SisOids) {
611 		string str_id = SeqDB_SimplifyAccession(itr->si);
612 		itr->si = NStr::ToLower(str_id);
613 	}
614 }
615 
InsureOrder(ESortOrder order)616 void CSeqDBGiList::InsureOrder(ESortOrder order)
617 {
618     // Code depends on OID order after translation, because various
619     // methods of SeqDB use this class for filtering purposes.
620 	static CFastMutex mtx;
621 	CFastMutexGuard mtx_gurad(mtx);
622     if ((order < m_CurrentOrder) || (order == eNone)) {
623         NCBI_THROW(CSeqDBException,
624                    eFileErr,
625                    "Out of sequence sort order requested.");
626     }
627 
628     // Input is usually sorted by GI, so we first test for sortedness.
629     // If it will fail it will probably do so almost immediately.
630 
631     if (order != m_CurrentOrder) {
632         switch(order) {
633         case eNone:
634             break;
635 
636         case eGi:
637             s_InsureOrder<CSeqDB_SortGiLessThan>(m_GisOids);
638             s_InsureOrder<CSeqDB_SortTiLessThan>(m_TisOids);
639             s_InsureOrder<CSeqDB_SortSiLessThan>(m_SisOids);
640             s_InsureOrder<CSeqDB_SortPigLessThan>(m_PigsOids);
641             break;
642 
643         default:
644             NCBI_THROW(CSeqDBException,
645                        eFileErr,
646                        "Unrecognized sort order requested.");
647         }
648 
649         m_CurrentOrder = order;
650     }
651 }
652 
653 
FindGi(TGi gi) const654 bool CSeqDBGiList::FindGi(TGi gi) const
655 {
656     int oid(0), index(0);
657     return (const_cast<CSeqDBGiList *>(this))->GiToOid(gi, oid, index);
658 }
659 
660 
GiToOid(TGi gi,int & oid)661 bool CSeqDBGiList::GiToOid(TGi gi, int & oid)
662 {
663     int index(0);
664     return GiToOid(gi, oid, index);
665 }
666 
667 
GiToOid(TGi gi,int & oid,int & index)668 bool CSeqDBGiList::GiToOid(TGi gi, int & oid, int & index)
669 {
670     InsureOrder(eGi);  // would assert be better?
671 
672     int b(0), e((int)m_GisOids.size());
673 
674     while(b < e) {
675         int m = (b + e)/2;
676         TGi m_gi = m_GisOids[m].gi;
677 
678         if (m_gi < gi) {
679             b = m + 1;
680         } else if (m_gi > gi) {
681             e = m;
682         } else {
683             oid = m_GisOids[m].oid;
684             index = m;
685             return true;
686         }
687     }
688 
689     oid = index = -1;
690     return false;
691 }
692 
693 
FindTi(TTi ti) const694 bool CSeqDBGiList::FindTi(TTi ti) const
695 {
696     int oid(0), index(0);
697     return (const_cast<CSeqDBGiList *>(this))->TiToOid(ti, oid, index);
698 }
699 
700 
TiToOid(TTi ti,int & oid)701 bool CSeqDBGiList::TiToOid(TTi ti, int & oid)
702 {
703     int index(0);
704     return TiToOid(ti, oid, index);
705 }
706 
707 
TiToOid(TTi ti,int & oid,int & index)708 bool CSeqDBGiList::TiToOid(TTi ti, int & oid, int & index)
709 {
710     InsureOrder(eGi);  // would assert be better?
711 
712     int b(0), e((int)m_TisOids.size());
713 
714     while(b < e) {
715         int m = (b + e)/2;
716         TTi m_ti = m_TisOids[m].ti;
717 
718         if (m_ti < ti) {
719             b = m + 1;
720         } else if (m_ti > ti) {
721             e = m;
722         } else {
723             oid = m_TisOids[m].oid;
724             index = m;
725             return true;
726         }
727     }
728 
729     oid = index = -1;
730     return false;
731 }
732 
FindSi(const string & si) const733 bool CSeqDBGiList::FindSi(const string &si) const
734 {
735     int oid(0), index(0);
736     return (const_cast<CSeqDBGiList *>(this))->SiToOid(si, oid, index);
737 }
738 
SiToOid(const string & si,int & oid)739 bool CSeqDBGiList::SiToOid(const string &si, int & oid)
740 {
741     int index(0);
742     return SiToOid(si, oid, index);
743 }
744 
SiToOid(const string & si,int & oid,int & index)745 bool CSeqDBGiList::SiToOid(const string &si, int & oid, int & index)
746 {
747     InsureOrder(eGi);
748 
749     int b(0), e((int)m_SisOids.size());
750 
751     while(b < e) {
752         int m = (b + e)/2;
753         const string & m_si = m_SisOids[m].si;
754 
755         if (m_si < si) {
756             b = m + 1;
757         } else if (si < m_si) {
758             e = m;
759         } else {
760             oid = m_SisOids[m].oid;
761             index = m;
762             return true;
763         }
764     }
765 
766     oid = index = -1;
767     return false;
768 }
769 
770 void
GetGiList(vector<TGi> & gis) const771 CSeqDBGiList::GetGiList(vector<TGi>& gis) const
772 {
773     gis.clear();
774     gis.reserve(GetNumGis());
775 
776     ITERATE(vector<SGiOid>, itr, m_GisOids) {
777         gis.push_back(itr->gi);
778     }
779 }
780 
781 void
GetPigList(vector<TPig> & pigs) const782 CSeqDBGiList::GetPigList(vector<TPig>& pigs) const
783 {
784     pigs.clear();
785     pigs.reserve(GetNumPigs());
786 
787     ITERATE(vector<SPigOid>, itr, m_PigsOids) {
788         pigs.push_back(itr->pig);
789     }
790 }
791 
792 void
GetTiList(vector<TTi> & tis) const793 CSeqDBGiList::GetTiList(vector<TTi>& tis) const
794 {
795     tis.clear();
796     tis.reserve(GetNumTis());
797 
798     ITERATE(vector<STiOid>, itr, m_TisOids) {
799         tis.push_back(itr->ti);
800     }
801 }
802 
803 
804 void
GetSiList(vector<string> & sis) const805 CSeqDBGiList::GetSiList(vector<string>& sis) const
806 {
807     sis.clear();
808     sis.reserve(GetNumSis());
809 
810     ITERATE(vector<SSiOid>, itr, m_SisOids) {
811         sis.push_back(itr->si);
812     }
813 }
814 
815 
816 
817 
818 
SeqDB_ReadBinaryGiList(const string & fname,vector<TGi> & gis)819 void SeqDB_ReadBinaryGiList(const string & fname, vector<TGi> & gis)
820 {
821     CMemoryFile mfile(SeqDB_MakeOSPath(fname));
822 
823     Uint4 * beginp = (Uint4*) mfile.GetPtr();
824     Uint4 * endp   = (Uint4*) (((char*)mfile.GetPtr()) + mfile.GetSize());
825 
826     Int4 num_gis = (Int4) (endp - beginp) - 2;
827 
828     gis.clear();
829 
830     if (((endp - beginp) < 2U)
831             ||  (beginp[0] != 0xFFFFFFFFU)
832             ||  (SeqDB_GetStdOrd(beginp + 1) != (Uint4) num_gis)) {
833         NCBI_THROW(CSeqDBException,
834                    eFileErr,
835                    "Specified file is not a valid binary GI file.");
836     }
837 
838     gis.reserve(num_gis);
839 
840     for(Uint4 * elem = (beginp + 2); elem < endp; ++elem) {
841         gis.push_back(GI_FROM(Uint4, SeqDB_GetStdOrd(elem)));
842     }
843 }
844 
845 /// This function determines whether a file is a valid binary GI/TI file.
846 /// @param fbeginp pointer to start of file [in]
847 /// @param fendp pointer to end of file [in]
848 /// @param has_long_ids will be set to true if the gi file contains long IDs [out]
849 /// @param has_tis will be set to true if the input file contains Trace IDs,
850 /// otherwise the file contains GIs [out]
851 /// @returns true if file is binary
852 /// @throws CSeqDBException if file is empty or invalid gi file
853 static
s_SeqDB_IsBinaryNumericList(const char * fbeginp,const char * fendp,bool & has_long_ids,bool * has_tis=NULL)854 bool s_SeqDB_IsBinaryNumericList(const char* fbeginp, const char* fendp,
855                                  bool& has_long_ids, bool* has_tis = NULL)
856 {
857     bool retval = false;
858     has_long_ids = false;
859     if (has_tis)
860         *has_tis = false;
861     Uint8 file_size = fendp - fbeginp;
862 
863     if (file_size == 0) {
864         NCBI_THROW(CSeqDBException,
865                    eFileErr,
866                    "Specified file is empty.");
867     } else if (isdigit((unsigned char)(*((char*) fbeginp))) ||
868                ((unsigned char)(*((char*) fbeginp)) == '#')) {
869         retval = false;
870     } else if ((file_size >= 8) && ((*fbeginp & 0xFF) == 0xFF)) {
871         retval = true;
872 
873         int marker = fbeginp[3] & 0xFF;
874 
875         if (marker == 0xFE || marker == 0xFC) {
876             has_long_ids = true;
877         }
878         if (has_tis && (marker == 0xFD || marker == 0xFC)) {
879             *has_tis = true;
880         }
881     } else {
882         NCBI_THROW(CSeqDBException,
883                    eFileErr,
884                    "Specified file is not a valid GI/TI list.");
885     }
886     return retval;
887 }
888 
s_ReadDigit(const char d,const string & list_type)889 int s_ReadDigit(const char d, const string & list_type)
890 {
891     switch(d) {
892 	    case '0':
893 	        return 0;
894 	    case '1':
895 	        return 1;
896 	    case '2':
897 	        return 2;
898 	    case '3':
899 	        return 3;
900 	    case '4':
901 	        return 4;
902 	    case '5':
903             return 5;
904         case '6':
905             return 6;
906         case '7':
907             return 7;
908         case '8':
909             return 8;
910         case '9':
911             return 9;
912         case ' ':
913         case '\n':
914         case '\r':
915     	    return -1;
916         default:
917         {
918              string msg = string("Invalid byte in text" +  list_type + " list [") +
919             		 	  NStr::UIntToString((unsigned char) d) + "].";
920              NCBI_THROW(CSeqDBException, eFileErr, msg);
921         }
922     }
923 }
924 
SeqDB_ReadMemoryGiList(const char * fbeginp,const char * fendp,vector<CSeqDBGiList::SGiOid> & gis,bool * in_order)925 void SeqDB_ReadMemoryGiList(const char * fbeginp,
926                             const char * fendp,
927                             vector<CSeqDBGiList::SGiOid> & gis,
928                             bool * in_order)
929 {
930     bool long_ids = false;
931     Uint8 file_size = fendp - fbeginp;
932 
933     if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
934         _ASSERT(long_ids == false);
935         Uint4* bbeginp = (Uint4*) fbeginp;
936         Uint4* bendp = (Uint4*) fendp;
937 
938         Uint8 num_gis = bendp - bbeginp - 2;
939 
940         gis.clear();
941 
942         if ((bbeginp[0] != 0xFFFFFFFFU)
943              ||  (SeqDB_GetStdOrd(bbeginp + 1) != (Uint4) num_gis)) {
944             NCBI_THROW(CSeqDBException,
945                        eFileErr,
946                        "Specified file is not a valid binary GI file.");
947         }
948 
949         gis.reserve(num_gis);
950 
951         if (in_order) {
952             TGi prev_gi = ZERO_GI;
953             bool in_gi_order = true;
954 
955             Uint4* elem = bbeginp + 2;
956             while(elem < bendp) {
957                 TGi this_gi = GI_FROM(Uint4, SeqDB_GetStdOrd(elem));
958                 gis.push_back(this_gi);
959 
960                 if (prev_gi > this_gi) {
961                     in_gi_order = false;
962                     break;
963                 }
964                 prev_gi = this_gi;
965                 elem++;
966             }
967 
968             while(elem < bendp) {
969                 gis.push_back(GI_FROM(Uint4, SeqDB_GetStdOrd(elem++)));
970             }
971 
972             *in_order = in_gi_order;
973         } else {
974             for(Uint4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
975                 gis.push_back(GI_FROM(Uint4, SeqDB_GetStdOrd(elem)));
976             }
977         }
978     } else {
979         _ASSERT(long_ids == false);
980         // We would prefer to do only one allocation, so assume
981         // average gi is 6 digits plus newline.  A few extra will be
982         // allocated, but this is preferable to letting the vector
983         // double itself (which it still will do if needed).
984 
985         gis.reserve((int) (file_size / 7));
986 
987         Uint4 elem(0);
988         const string list_type("GI");
989 
990         for(const char * p = fbeginp; p < fendp; p ++) {
991             int dig = s_ReadDigit(*p, list_type);
992             if (dig == -1) {
993                 if (elem != 0) {
994                     gis.push_back(GI_FROM(Uint4, elem));
995                 }
996                 elem = 0;
997                 continue;
998             }
999             elem *= 10;
1000             elem += dig;
1001         }
1002     }
1003 }
1004 
SeqDB_ReadMemoryPigList(const char * fbeginp,const char * fendp,vector<CSeqDBGiList::SPigOid> & pigs,bool * in_order)1005 void SeqDB_ReadMemoryPigList(const char * fbeginp,
1006                              const char * fendp,
1007                              vector<CSeqDBGiList::SPigOid> & pigs,
1008                              bool * in_order)
1009 {
1010     bool long_ids = false;
1011     Int8 file_size = fendp - fbeginp;
1012 
1013     if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
1014         Uint4* bbeginp = (Uint4*) fbeginp;
1015         Uint4* bendp = (Uint4*) fendp;
1016 
1017         Int4 num_pigs = (Int4) (bendp - bbeginp) - 2;
1018 
1019         pigs.clear();
1020 
1021         if (((bendp - bbeginp) < 2U)
1022                 ||  (bbeginp[0] != 0xFFFFFFFFU)
1023                 ||  (SeqDB_GetStdOrd(bbeginp + 1) != (Uint4) num_pigs)) {
1024             NCBI_THROW(CSeqDBException,
1025                        eFileErr,
1026                        "Specified file is not a valid binary IPG file.");
1027         }
1028 
1029         pigs.reserve(num_pigs);
1030 
1031         if (in_order) {
1032             TPig prev_pig = 0;
1033             bool sorted = true;
1034 
1035             Uint4* elem = bbeginp + 2;
1036             while(elem < bendp) {
1037                 TPig this_pig = SeqDB_GetStdOrd(elem);
1038                 pigs.push_back(this_pig);
1039 
1040                 if (prev_pig > this_pig) {
1041                     sorted = false;
1042                     break;
1043                 }
1044                 prev_pig = this_pig;
1045                 elem++;
1046             }
1047 
1048             while(elem < bendp) {
1049                 pigs.push_back(SeqDB_GetStdOrd(elem++));
1050             }
1051 
1052             *in_order = sorted;
1053         } else {
1054             for(Uint4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
1055                 pigs.push_back(SeqDB_GetStdOrd(elem));
1056             }
1057         }
1058     } else {
1059         pigs.reserve((int) (file_size / 7));
1060 
1061         Uint4 elem(0);
1062         const string list_type("IPG");
1063 
1064         for(const char * p = fbeginp; p < fendp; p ++) {
1065             int dig = s_ReadDigit(*p, list_type);
1066             if (dig == -1) {
1067                 // Skip blank lines or comments by ignoring zero.
1068                 if (elem != 0) {
1069                     pigs.push_back(elem);
1070                 }
1071                 elem = 0;
1072                 continue;
1073             }
1074             elem *= 10;
1075             elem += dig;
1076         }
1077     }
1078 }
1079 
SeqDB_ReadMemoryTaxIdList(const char * fbeginp,const char * fendp,CSeqDBGiList::STaxIdsOids & taxids)1080 void SeqDB_ReadMemoryTaxIdList(const char * fbeginp,
1081                                const char * fendp,
1082                                CSeqDBGiList::STaxIdsOids & taxids)
1083 {
1084     bool long_ids = false;
1085     if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
1086         Int4* bbeginp = (Int4*) fbeginp;
1087         Int4* bendp = (Int4*) fendp;
1088 
1089         Uint8 num_taxids = (bendp - bbeginp) - 2;
1090 
1091         taxids.tax_ids.clear();
1092         taxids.oids.clear();
1093 
1094         if (((bendp - bbeginp) < 2) || (bbeginp[0] != 0xFFFFFFFF)
1095               || (SeqDB_GetStdOrd(bbeginp + 1) != (Int4) num_taxids)) {
1096             NCBI_THROW(CSeqDBException, eFileErr,
1097                        "Specified file is not a valid binary Tax Id List file.");
1098         }
1099 
1100         for(Int4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
1101            taxids.tax_ids.insert(TAX_ID_FROM(Int4, SeqDB_GetStdOrd(elem)));
1102         }
1103     } else {
1104         Int4 elem(0);
1105         const string list_type("TAXID");
1106 
1107         for(const char * p = fbeginp; p < fendp; p ++) {
1108             int dig = s_ReadDigit(*p, list_type);
1109             if (dig == -1) {
1110                 // Skip blank lines or comments by ignoring zero.
1111                 if (elem != 0) {
1112                     taxids.tax_ids.insert(TAX_ID_FROM(Int4, elem));
1113                 }
1114                 elem = 0;
1115                 continue;
1116             }
1117             elem *= 10;
1118             elem += dig;
1119         }
1120     }
1121 }
1122 
1123 // [ NOTE: The 8 byte versions described here are not yet
1124 // implemented. ]
1125 //
1126 // FF..FF = -1 -> GI list <32 bit>
1127 // FF..FE = -2 -> GI list <64 bit>
1128 // FF..FD = -3 -> TI list <32 bit>
1129 // FF..FC = -4 -> TI list <64 bit>
1130 //
1131 // Format of the 8 byte TI list; note that we are still limited to
1132 // 2^32-1 TIs, which would involve an 32 GB identifier list file; this
1133 // code (in its current form) will not work at all on a 32 bit system
1134 // for GI files with more than about 500 megasequences, or TI files
1135 // with more than about 256 megasequences, assuming the current 16
1136 // bytes per vector element.  This is larger than the current total
1137 // number of GI sequences, but not larger than the number of TIs, so a
1138 // TI query for all TIs everywhere will most likely choke on a 32 bit
1139 // system because the data will simply not fit into memory (there are
1140 // nearly that many active TIs and the program will have other memory
1141 // expenditures.)
1142 //
1143 // 4 bytes: FF FF FF F?
1144 // 4 bytes: <number of TIs>
1145 // 8 bytes: TI#0
1146 // 8 bytes: TI#1
1147 // ...
1148 
SeqDB_ReadMemoryTiList(const char * fbeginp,const char * fendp,vector<CSeqDBGiList::STiOid> & tis,bool * in_order)1149 void SeqDB_ReadMemoryTiList(const char * fbeginp,
1150                             const char * fendp,
1151                             vector<CSeqDBGiList::STiOid> & tis,
1152                             bool * in_order)
1153 {
1154     bool long_ids = false;
1155     Int8 file_size = fendp - fbeginp;
1156 
1157     if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
1158         Int4 * bbeginp = (Int4*) fbeginp;
1159         Int4 * bendp = (Int4*) fendp;
1160         Int4 * bdatap = bbeginp + 2;
1161 
1162         Uint4 num_tis = (int)(bendp-bdatap);
1163 
1164         int remainder = num_tis % 2;
1165 
1166         if (long_ids) {
1167             num_tis /= 2;
1168         }
1169 
1170         tis.clear();
1171 
1172         bool bad_fmt = false;
1173 
1174         if (bendp < bdatap) {
1175             bad_fmt = true;
1176         } else {
1177             int marker = SeqDB_GetStdOrd(bbeginp);
1178             unsigned num_ids = SeqDB_GetStdOrd(bbeginp+1);
1179 
1180             if ((marker != -3 && marker != -4) ||
1181                 (num_ids != num_tis) ||
1182                 (remainder && long_ids)) {
1183 
1184                 bad_fmt = true;
1185             }
1186         }
1187 
1188         if (bad_fmt) {
1189             NCBI_THROW(CSeqDBException,
1190                        eFileErr,
1191                        "Specified file is not a valid binary GI or TI file.");
1192         }
1193 
1194         tis.reserve(num_tis);
1195 
1196         if (long_ids) {
1197             Int8 * bdatap8 = (Int8*) bdatap;
1198             Int8 * bendp8 = (Int8*) bendp;
1199 
1200             if (in_order) {
1201                 Int8 prev_ti =0;
1202                 bool in_ti_order = true;
1203 
1204                 Int8 * elem = bdatap8;
1205 
1206                 while(elem < bendp8) {
1207                     Int8 this_ti = (Int8) SeqDB_GetStdOrd(elem);
1208                     tis.push_back(this_ti);
1209 
1210                     if (prev_ti > this_ti) {
1211                         in_ti_order = false;
1212                         break;
1213                     }
1214                     prev_ti = this_ti;
1215                     elem ++;
1216                 }
1217 
1218                 while(elem < bendp8) {
1219                     tis.push_back((Int8) SeqDB_GetStdOrd(elem++));
1220                 }
1221 
1222                 *in_order = in_ti_order;
1223             } else {
1224                 for(Int8 * elem = bdatap8; elem < bendp8; elem ++) {
1225                     tis.push_back((Int8) SeqDB_GetStdOrd(elem));
1226                 }
1227             }
1228         } else {
1229             if (in_order) {
1230                 int prev_ti =0;
1231                 bool in_ti_order = true;
1232 
1233                 Int4 * elem = bdatap;
1234 
1235                 while(elem < bendp) {
1236                     int this_ti = (int) SeqDB_GetStdOrd(elem);
1237                     tis.push_back(this_ti);
1238 
1239                     if (prev_ti > this_ti) {
1240                         in_ti_order = false;
1241                         break;
1242                     }
1243                     prev_ti = this_ti;
1244                     elem ++;
1245                 }
1246 
1247                 while(elem < bendp) {
1248                     tis.push_back((int) SeqDB_GetStdOrd(elem++));
1249                 }
1250 
1251                 *in_order = in_ti_order;
1252             } else {
1253                 for(Int4 * elem = bdatap; elem < bendp; elem ++) {
1254                     tis.push_back((int) SeqDB_GetStdOrd(elem));
1255                 }
1256             }
1257         }
1258     } else {
1259         // We would prefer to do only one allocation, so assume
1260         // average gi is 6 digits plus newline.  A few extra will be
1261         // allocated, but this is preferable to letting the vector
1262         // double itself (which it still will do if needed).
1263 
1264         tis.reserve(int(file_size / 7));
1265 
1266         Int8 elem(0);
1267         const string list_type("TI");
1268 
1269         for(const char * p = fbeginp; p < fendp; p ++) {
1270             int dig = s_ReadDigit(*p, list_type);
1271             if (dig == -1) {
1272                 if (elem != 0) {
1273                     tis.push_back(elem);
1274                 }
1275                 elem = 0;
1276                 continue;
1277             }
1278             elem *= 10;
1279             elem += dig;
1280         }
1281     }
1282 }
1283 
SeqDB_ReadMemorySiList(const char * fbeginp,const char * fendp,vector<CSeqDBGiList::SSiOid> & sis,bool * in_order)1284 void SeqDB_ReadMemorySiList(const char * fbeginp,
1285                             const char * fendp,
1286                             vector<CSeqDBGiList::SSiOid> & sis,
1287                             bool * in_order)
1288 {
1289     Int8 file_size = fendp - fbeginp;
1290 
1291     // We would prefer to do only one allocation, so assume
1292     // average seqid is 6 digits plus newline.  A few extra will be
1293     // allocated, but this is preferable to letting the vector
1294     // double itself (which it still will do if needed).
1295 
1296     sis.reserve(sis.size() + int(file_size / 7));
1297 
1298     const char * p = fbeginp;
1299     const char * head;
1300     while ( p < fendp) {
1301         // find the head of the seqid
1302         while (p< fendp && (*p=='>' || *p==' ' || *p=='\t' || *p=='\n' || *p=='\r')) ++p;
1303         if (p< fendp && *p == '#') {
1304             // anything beyond this point in the line is a comment
1305             while (p< fendp && *p!='\n') ++p;
1306             continue;
1307         }
1308         head = p;
1309         while (p< fendp && *p!=' ' && *p!='\t' && *p!='\n' && *p!='\r') ++p;
1310         if (p > head) {
1311             string acc(head, p);
1312             string str_id = NStr::TruncateSpaces(acc, NStr::eTrunc_Both);
1313             if (str_id != "") {
1314                 sis.push_back(str_id);
1315             } else {
1316                 cerr << "WARNING:  " << acc
1317                      << " is not a valid seqid string." << endl;
1318             }
1319         }
1320     }
1321     if (in_order) *in_order = false;
1322 }
1323 
SeqDB_ReadMemoryMixList(const char * fbeginp,const char * fendp,vector<CSeqDBGiList::SGiOid> & gis,vector<CSeqDBGiList::STiOid> & tis,vector<CSeqDBGiList::SSiOid> & sis,bool * in_order)1324 void SeqDB_ReadMemoryMixList(const char * fbeginp,
1325                             const char * fendp,
1326                             vector<CSeqDBGiList::SGiOid> & gis,
1327                             vector<CSeqDBGiList::STiOid> & tis,
1328                             vector<CSeqDBGiList::SSiOid> & sis,
1329                             bool * in_order)
1330 {
1331     Int8 file_size = fendp - fbeginp;
1332 
1333     // We would prefer to do only one allocation, so assume
1334     // average seqid is 6 digits plus newline.  A few extra will be
1335     // allocated, but this is preferable to letting the vector
1336     // double itself (which it still will do if needed).
1337 
1338     sis.reserve(sis.size() + int(file_size / 7));
1339 
1340     const char * p = fbeginp;
1341     const char * head;
1342     while ( p < fendp) {
1343         // find the head of the seqid
1344         while (p< fendp && (*p=='>' || *p==' ' || *p=='\t' || *p=='\n' || *p=='\r')) ++p;
1345         if (p< fendp && *p == '#') {
1346             // anything beyond this point in the line is a comment
1347             while (p< fendp && *p!='\n') ++p;
1348             continue;
1349         }
1350         head = p;
1351         while (p< fendp && *p!=' ' && *p!='\t' && *p!='\n' && *p!='\r') ++p;
1352         if (p > head) {
1353             string acc(head, p);
1354             string str_id;
1355             Int8 num_id;
1356             bool simpler;
1357             ESeqDBIdType id_type = SeqDB_SimplifyAccession(acc, num_id, str_id, simpler);
1358             if (eStringId == id_type) {
1359                 sis.push_back(NStr::ToLower(str_id));
1360             }
1361             else if (eTiId == id_type) {
1362             	tis.push_back((TTi) num_id);
1363             }
1364             else if (eGiId == id_type) {
1365             	gis.push_back(GI_FROM(Int8, num_id));
1366             }
1367             else {
1368                 cerr << "WARNING:  " << acc
1369                      << " is not a valid seqid string." << endl;
1370             }
1371         }
1372     }
1373     if (in_order) *in_order = false;
1374 }
1375 
s_ContainsBinaryNumericIdList(const string & fname,CSeqDBFileGiList::EIdType type)1376 static bool s_ContainsBinaryNumericIdList(const string& fname, CSeqDBFileGiList::EIdType type)
1377 {
1378     CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1379 
1380     Int8 file_size = mfile.GetSize();
1381     const char * fbeginp = (char*) mfile.GetPtr();
1382     const char * fendp   = fbeginp + (int)file_size;
1383 
1384     bool ignore = false;
1385     bool has_tis = false;
1386     bool retval = s_SeqDB_IsBinaryNumericList(fbeginp, fendp, ignore, &has_tis);
1387     if (type == CSeqDBFileGiList::eGiList) {
1388         return retval;
1389     } else {
1390         retval = has_tis && retval;
1391     }
1392     return retval;
1393 }
1394 
SeqDB_IsBinaryTiList(const string & fname)1395 bool SeqDB_IsBinaryTiList(const string  & fname)
1396 {
1397     return s_ContainsBinaryNumericIdList(fname, CSeqDBFileGiList::eTiList);
1398 }
1399 
SeqDB_IsBinaryGiList(const string & fname)1400 bool SeqDB_IsBinaryGiList(const string  & fname)
1401 {
1402     return s_ContainsBinaryNumericIdList(fname, CSeqDBFileGiList::eGiList);
1403 }
1404 
SeqDB_ReadGiList(const string & fname,vector<CSeqDBGiList::SGiOid> & gis,bool * in_order)1405 void SeqDB_ReadGiList(const string & fname, vector<CSeqDBGiList::SGiOid> & gis, bool * in_order)
1406 {
1407     CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1408 
1409     Int8 file_size = mfile.GetSize();
1410     const char * fbeginp = (char*) mfile.GetPtr();
1411     const char * fendp   = fbeginp + file_size;
1412 
1413     SeqDB_ReadMemoryGiList(fbeginp, fendp, gis, in_order);
1414 }
1415 
1416 
SeqDB_ReadTiList(const string & fname,vector<CSeqDBGiList::STiOid> & tis,bool * in_order)1417 void SeqDB_ReadTiList(const string & fname, vector<CSeqDBGiList::STiOid> & tis, bool * in_order)
1418 {
1419     CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1420 
1421     Int8 file_size = mfile.GetSize();
1422     const char * fbeginp = (char*) mfile.GetPtr();
1423     const char * fendp   = fbeginp + file_size;
1424 
1425     SeqDB_ReadMemoryTiList(fbeginp, fendp, tis, in_order);
1426 }
1427 
SeqDB_ReadMixList(const string & fname,vector<CSeqDBGiList::SGiOid> & gis,vector<CSeqDBGiList::STiOid> & tis,vector<CSeqDBGiList::SSiOid> & sis,bool * in_order)1428 void SeqDB_ReadMixList(const string & fname, vector<CSeqDBGiList::SGiOid> & gis,
1429 		vector<CSeqDBGiList::STiOid> & tis, vector<CSeqDBGiList::SSiOid> & sis, bool * in_order)
1430 {
1431     CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1432 
1433     Int8 file_size = mfile.GetSize();
1434     const char *fbeginp = (char*) mfile.GetPtr();
1435     const char *fendp   = fbeginp + file_size;
1436 
1437     SeqDB_ReadMemoryMixList(fbeginp, fendp, gis, tis, sis, in_order);
1438 }
1439 
SeqDB_ReadPigList(const string & fname,vector<CSeqDBGiList::SPigOid> & pigs,bool * in_order)1440 void SeqDB_ReadPigList(const string & fname, vector<CSeqDBGiList::SPigOid> & pigs, bool * in_order)
1441 {
1442     CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1443 
1444     Int8 file_size = mfile.GetSize();
1445     const char * fbeginp = (char*) mfile.GetPtr();
1446     const char * fendp   = fbeginp + file_size;
1447 
1448     SeqDB_ReadMemoryPigList(fbeginp, fendp, pigs, in_order);
1449 }
1450 
SeqDB_ReadTaxIdList(const string & fname,CSeqDBGiList::STaxIdsOids & taxids)1451 void SeqDB_ReadTaxIdList(const string & fname, CSeqDBGiList::STaxIdsOids & taxids)
1452 {
1453     CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1454 
1455     Int8 file_size = mfile.GetSize();
1456     const char * fbeginp = (char*) mfile.GetPtr();
1457     const char * fendp   = fbeginp + file_size;
1458 
1459     SeqDB_ReadMemoryTaxIdList(fbeginp, fendp, taxids);
1460 }
1461 
SeqDB_ReadGiList(const string & fname,vector<TGi> & gis,bool * in_order)1462 void SeqDB_ReadGiList(const string & fname, vector<TGi> & gis, bool * in_order)
1463 {
1464     typedef vector<CSeqDBGiList::SGiOid> TPairList;
1465 
1466     TPairList pairs;
1467     SeqDB_ReadGiList(fname, pairs, in_order);
1468 
1469     gis.reserve(pairs.size());
1470 
1471     ITERATE(TPairList, iter, pairs) {
1472         gis.push_back(iter->gi);
1473     }
1474 }
1475 
SeqDB_ReadSiList(const string & fname,vector<CSeqDBGiList::SSiOid> & sis,bool * in_order,SBlastSeqIdListInfo & db_info)1476 void SeqDB_ReadSiList(const string & fname, vector<CSeqDBGiList::SSiOid> & sis, bool * in_order, SBlastSeqIdListInfo & db_info)
1477 {
1478 	CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1479     if (CBlastSeqidlistFile::GetSeqidlist(mfile, sis, db_info)) {
1480     	*in_order = true;
1481     	return;
1482     }
1483     else {
1484     	Int8 file_size = mfile.GetSize();
1485     	const char *fbeginp = (char*) mfile.GetPtr();
1486     	const char *fendp   = fbeginp + file_size;
1487     	SeqDB_ReadMemorySiList(fbeginp, fendp, sis, in_order);
1488     }
1489 }
1490 
FindGi(TGi gi)1491 bool CSeqDBNegativeList::FindGi(TGi gi)
1492 {
1493     InsureOrder();
1494     int b(0), e((int)m_Gis.size());
1495 
1496     while(b < e) {
1497         int m = (b + e)/2;
1498         TGi m_gi = m_Gis[m];
1499 
1500         if (m_gi < gi) {
1501             b = m + 1;
1502         } else if (m_gi > gi) {
1503             e = m;
1504         } else {
1505             return true;
1506         }
1507     }
1508 
1509     return false;
1510 }
1511 
1512 
FindTi(TTi ti)1513 bool CSeqDBNegativeList::FindTi(TTi ti)
1514 {
1515     InsureOrder();
1516 
1517     int b(0), e((int)m_Tis.size());
1518 
1519     while(b < e) {
1520         int m = (b + e)/2;
1521         TTi m_ti = m_Tis[m];
1522 
1523         if (m_ti < ti) {
1524             b = m + 1;
1525         } else if (m_ti > ti) {
1526             e = m;
1527         } else {
1528             return true;
1529         }
1530     }
1531 
1532     return false;
1533 }
1534 
FindId(const CSeq_id & id)1535 bool CSeqDBNegativeList::FindId(const CSeq_id & id)
1536 {
1537     bool match_type = false;
1538     return FindId(id, match_type);
1539 }
1540 
1541 
FindSi(string si)1542 bool CSeqDBNegativeList::FindSi(string si)
1543 {
1544     InsureOrder();
1545     int b(0), e((int)m_Sis.size());
1546 
1547     while(b < e) {
1548         int m = (b + e)/2;
1549         string m_si = m_Sis[m];
1550 
1551         if (m_si < si) {
1552             b = m + 1;
1553         } else if (m_si > si) {
1554             e = m;
1555         } else {
1556             return true;
1557         }
1558     }
1559 
1560     return false;
1561 }
1562 
1563 
FindId(const CSeq_id & id,bool & match_type)1564 bool CSeqDBNegativeList::FindId(const CSeq_id & id, bool & match_type)
1565 {
1566     if (id.IsGi()) {
1567         match_type = (GetNumGis() > 0) ? true : false;
1568         if(match_type) {
1569             return(FindGi(id.GetGi()));
1570         }
1571     } else if (id.IsGeneral() && id.GetGeneral().GetDb() == "ti") {
1572         match_type = (GetNumTis() > 0) ? true : false;
1573 
1574         if(match_type) {
1575             const CObject_id & obj = id.GetGeneral().GetTag();
1576 
1577             Int8 ti = (obj.IsId()
1578                    ? obj.GetId()
1579                    : NStr::StringToInt8(obj.GetStr()));
1580 
1581             return FindTi(ti);
1582         }
1583     } else {
1584         match_type = (GetNumSis() > 0) ? true : false;
1585 
1586         if(match_type) {
1587         	if(FindSi(GetBlastSeqIdString(id, true))) return true;
1588         	if(FindSi(GetBlastSeqIdString(id, false))) return true;
1589 
1590         	// For isam lookup
1591             Int8 num_id;
1592             string str_id;
1593             bool simpler;
1594 
1595             SeqDB_SimplifySeqid(*(const_cast<CSeq_id *>(&id)), 0, num_id, str_id, simpler);
1596 
1597             if (FindSi(str_id)) {
1598                 return true;
1599             }
1600 
1601             // We may have to strip the version to find it...
1602             size_t pos = str_id.find(".");
1603             if (pos != str_id.npos) {
1604                 string nover(str_id, 0, pos);
1605                 return FindSi(nover);
1606             }
1607         }
1608     }
1609     return false;
1610 }
1611 
PreprocessIdsForISAMSiLookup()1612 void CSeqDBNegativeList::PreprocessIdsForISAMSiLookup()
1613 {
1614 	NON_CONST_ITERATE(vector<string>, itr, m_Sis) {
1615 		string str_id = SeqDB_SimplifyAccession(*itr);
1616 		*itr = NStr::ToLower(str_id);
1617 	}
1618 }
1619 
InsureOrder()1620 void CSeqDBNegativeList::InsureOrder()
1621 {
1622    	static CFastMutex mtx;
1623    	CFastMutexGuard mtx_gurad(mtx);
1624     if (m_LastSortSize != (m_Gis.size() + m_Tis.size() +m_Sis.size())) {
1625         std::sort(m_Gis.begin(), m_Gis.end());
1626         std::sort(m_Tis.begin(), m_Tis.end());
1627         std::sort(m_Sis.begin(), m_Sis.end());
1628 
1629         m_LastSortSize = m_Gis.size() + m_Tis.size() + m_Sis.size();
1630     }
1631 }
1632 
FindId(const CSeq_id & id)1633 bool CSeqDBGiList::FindId(const CSeq_id & id)
1634 {
1635     if (id.IsGi()) {
1636         return FindGi(id.GetGi());
1637     } else if (id.IsGeneral() && id.GetGeneral().GetDb() == "ti") {
1638         const CObject_id & obj = id.GetGeneral().GetTag();
1639 
1640         TTi ti = (obj.IsId()
1641                    ? (TTi) obj.GetId()
1642                    : (TTi) NStr::StringToInt8(obj.GetStr()));
1643 
1644         return FindTi(ti);
1645     } else {
1646     	if(FindSi(GetBlastSeqIdString(id, true))) return true;
1647     	if(FindSi(GetBlastSeqIdString(id, false))) return true;
1648 
1649     	/// For isam lookup
1650         Int8 num_id;
1651         string str_id;
1652         bool simpler;
1653         SeqDB_SimplifySeqid(*(const_cast<CSeq_id *>(&id)), 0, num_id, str_id, simpler);
1654         if (FindSi(str_id)) return true;
1655 
1656         // We may have to strip the version to find it...
1657         size_t pos = str_id.find(".");
1658         if (pos != str_id.npos) {
1659             string nover(str_id, 0, pos);
1660             return FindSi(nover);
1661         }
1662     }
1663     return false;
1664 }
1665 
1666 
CSeqDBFileGiList(const string & fname,EIdType idtype)1667 CSeqDBFileGiList::CSeqDBFileGiList(const string & fname, EIdType idtype)
1668 {
1669     bool in_order = false;
1670     switch(idtype) {
1671         case eGiList:
1672             SeqDB_ReadGiList(fname, m_GisOids, & in_order);
1673             break;
1674         case eTiList:
1675             SeqDB_ReadTiList(fname, m_TisOids, & in_order);
1676             break;
1677         case eSiList:
1678             SeqDB_ReadSiList(fname, m_SisOids, & in_order, m_ListInfo);
1679             break;
1680         case eMixList:
1681         	SeqDB_ReadMixList(fname, m_GisOids, m_TisOids, m_SisOids, & in_order);
1682         	break;
1683         case ePigList:
1684         	SeqDB_ReadPigList(fname, m_PigsOids, & in_order);
1685         	break;
1686         case eTaxIdList:
1687         	SeqDB_ReadTaxIdList(fname, m_TaxIdsOids);
1688         	in_order = true;
1689         	break;
1690     }
1691     m_CurrentOrder = in_order ? eGi : eNone;
1692 }
1693 /*
1694 CSeqDBFileGiList::CSeqDBFileGiList(vector<string> fnames, EIdType idtype)
1695 {
1696     bool in_order = false;
1697     switch(idtype) {
1698         case eGiList:
1699         case eTiList:
1700             NCBI_THROW(CSeqDBException,
1701                    eArgErr,
1702                    "Only multiple seqid list is supported.");
1703         case eSiList:
1704             ITERATE(vector<string>, iter, fnames) {
1705                 SeqDB_ReadSiList(*iter, m_SisOids, & in_order);
1706             }
1707             break;
1708         case eMixList:
1709             ITERATE(vector<string>, iter, fnames) {
1710             	SeqDB_ReadMixList(*iter, m_GisOids, m_TisOids, m_SisOids, & in_order);
1711             }
1712             break;
1713     }
1714     m_CurrentOrder = in_order ? eGi : eNone;
1715 }
1716 */
SeqDB_CombineAndQuote(const vector<string> & dbs,string & dbname)1717 void SeqDB_CombineAndQuote(const vector<string> & dbs,
1718                            string               & dbname)
1719 {
1720     int sz = 0;
1721 
1722     for(unsigned i = 0; i < dbs.size(); i++) {
1723         sz += int(3 + dbs[i].size());
1724     }
1725 
1726     dbname.reserve(sz);
1727 
1728     for(unsigned i = 0; i < dbs.size(); i++) {
1729         if (dbname.size()) {
1730             dbname.append(" ");
1731         }
1732 
1733         if (dbs[i].find(" ") != string::npos) {
1734             dbname.append("\"");
1735             dbname.append(dbs[i]);
1736             dbname.append("\"");
1737         } else {
1738             dbname.append(dbs[i]);
1739         }
1740     }
1741 }
1742 
1743 
SeqDB_SplitQuoted(const string & dbname,vector<CTempString> & dbs,bool keep_quote)1744 void SeqDB_SplitQuoted(const string        & dbname,
1745                        vector<CTempString> & dbs,
1746                        bool keep_quote)
1747 {
1748     vector<CSeqDB_Substring> subs;
1749 
1750     SeqDB_SplitQuoted(dbname, subs, keep_quote);
1751 
1752     dbs.resize(0);
1753     dbs.reserve(subs.size());
1754 
1755     ITERATE(vector<CSeqDB_Substring>, iter, subs) {
1756         CTempString tmp(iter->GetBegin(), iter->Size());
1757         dbs.push_back(tmp);
1758     }
1759 }
1760 
1761 
SeqDB_SplitQuoted(const string & dbname,vector<CSeqDB_Substring> & dbs,bool keep_quote)1762 void SeqDB_SplitQuoted(const string             & dbname,
1763                        vector<CSeqDB_Substring> & dbs,
1764                        bool keep_quote)
1765 {
1766     // split names
1767 
1768     const char * sp = dbname.data();
1769 
1770     bool quoted = false;
1771     unsigned begin = 0;
1772 
1773     for(unsigned i = 0; i < dbname.size(); i++) {
1774         char ch = dbname[i];
1775 
1776         if (quoted) {
1777             // Quoted mode sees '"' as the only actionable token.
1778             if (ch == '"') {
1779                 if (begin < i) {
1780                 	if(keep_quote) i++;
1781                     dbs.push_back(CSeqDB_Substring(sp + begin, sp + i));
1782                 }
1783                 begin = i + 1;
1784                 quoted = false;
1785             }
1786         } else {
1787             // Non-quote mode: Space or quote starts the next string.
1788 
1789             if (ch == ' ') {
1790                 if (begin < i) {
1791                     dbs.push_back(CSeqDB_Substring(sp + begin, sp + i));
1792                 }
1793                 begin = i + 1;
1794             } else if (ch == '"') {
1795                 if (begin < i) {
1796                     dbs.push_back(CSeqDB_Substring(sp + begin, sp + i));
1797                 }
1798                 begin = keep_quote ? i : i + 1;
1799                 quoted = true;
1800             }
1801         }
1802     }
1803 
1804     if (begin < dbname.size()) {
1805         dbs.push_back(CSeqDB_Substring(sp + begin, sp + dbname.size()));
1806     }
1807 }
1808 
1809 
CIntersectionGiList(CSeqDBGiList & gilist,vector<TGi> & gis)1810 CIntersectionGiList::CIntersectionGiList(CSeqDBGiList & gilist, vector<TGi> & gis)
1811 {
1812     _ASSERT(this != & gilist);
1813 
1814     gilist.InsureOrder(CSeqDBGiList::eGi);
1815     sort(gis.begin(), gis.end());
1816 
1817     int list_i = 0;
1818     int list_n = gilist.GetNumGis();
1819     int gis_i = 0;
1820     int gis_n = (int) gis.size();
1821 
1822     while(list_i < list_n && gis_i < gis_n) {
1823         TGi L = gilist.GetGiOid(list_i).gi;
1824         TGi G = gis[gis_i];
1825 
1826         if (L < G) {
1827             list_i ++;
1828             continue;
1829         }
1830 
1831         if (L > G) {
1832             gis_i ++;
1833             continue;
1834         }
1835 
1836         m_GisOids.push_back(gilist.GetGiOid(list_i));
1837 
1838         list_i++;
1839         gis_i++;
1840     }
1841 
1842     m_CurrentOrder = m_GisOids.size() ? eGi : eNone;
1843 }
1844 
1845 
CIntersectionGiList(CSeqDBNegativeList & neg_gilist,vector<TGi> & gis)1846 CIntersectionGiList::CIntersectionGiList(CSeqDBNegativeList & neg_gilist, vector<TGi> & gis)
1847 {
1848     neg_gilist.InsureOrder();
1849     sort(gis.begin(), gis.end());
1850 
1851     int list_i = 0;
1852     int list_n = neg_gilist.GetNumGis();
1853     int gis_i = 0;
1854     int gis_n = (int) gis.size();
1855 
1856     while(list_i < list_n && gis_i < gis_n) {
1857         TGi L = neg_gilist.GetGi(list_i);
1858         TGi G = gis[gis_i];
1859 
1860         if (L < G) {
1861             list_i ++;
1862             continue;
1863         }
1864 
1865         if (L > G) {
1866             m_GisOids.push_back(gis[gis_i]);
1867             gis_i ++;
1868             continue;
1869         }
1870 
1871         list_i++;
1872 
1873         TGi last_gi = gis[gis_i];
1874         do { gis_i++; } while (gis_i < gis_n && gis[gis_i] == last_gi);
1875     }
1876 
1877     // push all the remaining vector gi's if any left
1878     while (gis_i < gis_n) {
1879         m_GisOids.push_back(gis[gis_i++]);
1880     }
1881 
1882     m_CurrentOrder = m_GisOids.size() ? eGi : eNone;
1883 }
1884 
1885 
CSeqDBIdSet(const vector<Int4> & ids,EIdType t,bool positive)1886 CSeqDBIdSet::CSeqDBIdSet(const vector<Int4> & ids, EIdType t, bool positive)
1887     : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1888 {
1889     x_SortAndUnique(m_Ids->Set());
1890 }
1891 
CSeqDBIdSet(const vector<Int8> & ids,EIdType t,bool positive)1892 CSeqDBIdSet::CSeqDBIdSet(const vector<Int8> & ids, EIdType t, bool positive)
1893     : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1894 {
1895     x_SortAndUnique(m_Ids->Set());
1896 }
1897 
CSeqDBIdSet(const vector<Uint8> & ids,EIdType t,bool positive)1898 CSeqDBIdSet::CSeqDBIdSet(const vector<Uint8> & ids, EIdType t, bool positive)
1899     : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1900 {
1901     x_SortAndUnique(m_Ids->Set());
1902 }
1903 
1904 #ifdef NCBI_STRICT_GI
CSeqDBIdSet(const vector<TGi> & ids,EIdType t,bool positive)1905 CSeqDBIdSet::CSeqDBIdSet(const vector<TGi> & ids, EIdType t, bool positive)
1906     : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1907 {
1908     x_SortAndUnique(m_Ids->Set());
1909 }
1910 #endif
1911 
CSeqDBIdSet(const vector<string> & ids,EIdType t,bool positive)1912 CSeqDBIdSet::CSeqDBIdSet(const vector<string> & ids, EIdType t, bool positive)
1913     : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1914 {
1915     x_SortAndUnique(m_Ids->SetSeqIDs());
1916 }
1917 
x_SortAndUnique(vector<Int8> & ids)1918 void CSeqDBIdSet::x_SortAndUnique(vector<Int8> & ids)
1919 {
1920     sort(ids.begin(), ids.end());
1921     ids.erase(unique(ids.begin(), ids.end()), ids.end());
1922 }
1923 
1924 
x_SortAndUnique(vector<string> & ids)1925 void CSeqDBIdSet::x_SortAndUnique(vector<string> & ids)
1926 {
1927     sort(ids.begin(), ids.end());
1928     ids.erase(unique(ids.begin(), ids.end()), ids.end());
1929 }
1930 
Negate()1931 void CSeqDBIdSet::Negate()
1932 {
1933     m_Positive = ! m_Positive;
1934 }
1935 
1936 void CSeqDBIdSet::
x_SummarizeBooleanOp(EOperation op,bool A_pos,bool B_pos,bool & result_pos,bool & incl_A,bool & incl_B,bool & incl_AB)1937 x_SummarizeBooleanOp(EOperation op,
1938                      bool       A_pos,
1939                      bool       B_pos,
1940                      bool     & result_pos,
1941                      bool     & incl_A,
1942                      bool     & incl_B,
1943                      bool     & incl_AB)
1944 {
1945     typedef CSeqDBIdSet TIdList;
1946 
1947     incl_A = incl_B = incl_AB = false;
1948 
1949     // Each binary boolean function can be represented as a 4 bit
1950     // descriptor.  The four bits indicate whether the result is true
1951     // when it appears, respectively, in neither list, only the second
1952     // list, only the first list, or in both lists.  For example, the
1953     // operation (A AND B) can be represented as (0001), and (A OR !B)
1954     // can be written as (1011).  In a positive ID list, 1 means that
1955     // an ID should be included in database iteration.
1956 
1957     // But 4-bit descriptors starting with a '1' correspond to logical
1958     // operations that include all IDs not appearing in either input
1959     // set.  But of course we do not have access to the IDs that do
1960     // not appear, so we cannot (feasibly) compute such operations.
1961 
1962     // To solve this problem, De Morgan's Laws are used to transform
1963     // the operation into its inverse, the results of which can be
1964     // applied to SeqDB as a negative ID list.
1965 
1966     // For our purposes, these three transforms are needed:
1967     //
1968     //  1. (!X and !Y) becomes !(X or Y)
1969     //  2. (!X or Y) becomes !(X and !Y)
1970     //  3. (X or !Y) becomes !(!X and Y)
1971 
1972     result_pos = true;
1973 
1974     switch(op) {
1975     case eAnd:
1976         if ((! A_pos) && (! B_pos)) {
1977             op = TIdList::eOr;
1978             result_pos = false;
1979             A_pos = B_pos = true;
1980         }
1981         break;
1982 
1983     case eOr:
1984         if ((! A_pos) || (! B_pos)) {
1985             op = TIdList::eAnd;
1986             result_pos = false;
1987             A_pos = ! A_pos;
1988             B_pos = ! B_pos;
1989         }
1990         break;
1991 
1992     case eXor:
1993         result_pos = A_pos == B_pos;
1994         break;
1995 
1996     default:
1997         break;
1998     }
1999 
2000     // Once we have a legal operation, we construct these flags to
2001     // summarize the boolean operation.  (Each of these corresponds to
2002     // one of the bits in the 4-bit descriptor.)
2003 
2004     switch(op) {
2005     case eAnd:
2006         _ASSERT(A_pos || B_pos);
2007         incl_A = !B_pos;
2008         incl_B = !A_pos;
2009         incl_AB = A_pos && B_pos;
2010         break;
2011 
2012     case eOr:
2013         _ASSERT(A_pos || B_pos);
2014         incl_A = incl_B = incl_AB = true;
2015         break;
2016 
2017     case eXor:
2018         incl_AB = (A_pos != B_pos);
2019         incl_A = incl_B = ! incl_AB;
2020         break;
2021 
2022     default:
2023         break;
2024     }
2025 }
2026 
2027 void CSeqDBIdSet::
x_BooleanSetOperation(EOperation op,const vector<Int8> & A,bool A_pos,const vector<Int8> & B,bool B_pos,vector<Int8> & result,bool & result_pos)2028 x_BooleanSetOperation(EOperation           op,
2029                       const vector<Int8> & A,
2030                       bool                 A_pos,
2031                       const vector<Int8> & B,
2032                       bool                 B_pos,
2033                       vector<Int8>       & result,
2034                       bool               & result_pos)
2035 {
2036     bool incl_A(false),
2037         incl_B(false),
2038         incl_AB(false);
2039 
2040     x_SummarizeBooleanOp(op,
2041                          A_pos,
2042                          B_pos,
2043                          result_pos,
2044                          incl_A,
2045                          incl_B,
2046                          incl_AB);
2047 
2048     size_t A_i(0), B_i(0);
2049 
2050     while((A_i < A.size()) && (B_i < B.size())) {
2051         Int8 Ax(A[A_i]), Bx(B[B_i]), target(-1);
2052         bool included(false);
2053 
2054         if (Ax < Bx) {
2055             ++ A_i;
2056             target = Ax;
2057             included = incl_A;
2058         } else if (Ax > Bx) {
2059             ++ B_i;
2060             target = Bx;
2061             included = incl_B;
2062         } else {
2063             ++ A_i;
2064             ++ B_i;
2065             target = Ax;
2066             included = incl_AB;
2067         }
2068 
2069         if (included) {
2070             result.push_back(target);
2071         }
2072     }
2073 
2074     if (incl_A) {
2075         while(A_i < A.size()) {
2076             result.push_back(A[A_i++]);
2077         }
2078     }
2079 
2080     if (incl_B) {
2081         while(B_i < B.size()) {
2082             result.push_back(B[B_i++]);
2083         }
2084     }
2085 }
2086 
Compute(EOperation op,const vector<Int4> & ids,bool positive)2087 void CSeqDBIdSet::Compute(EOperation          op,
2088                           const vector<Int4> & ids,
2089                           bool                positive)
2090 {
2091     CRef<CSeqDBIdSet_Vector> result(new CSeqDBIdSet_Vector);
2092 
2093     CRef<CSeqDBIdSet_Vector> B(new CSeqDBIdSet_Vector(ids));
2094 
2095     x_SortAndUnique(B->Set());
2096 
2097     bool result_pos(true);
2098 
2099     x_BooleanSetOperation(op,
2100                           m_Ids->Set(),
2101                           m_Positive,
2102                           B->Set(),
2103                           positive,
2104                           result->Set(),
2105                           result_pos);
2106 
2107     m_Positive = result_pos;
2108     m_Ids = result;
2109 }
2110 
Compute(EOperation op,const vector<Int8> & ids,bool positive)2111 void CSeqDBIdSet::Compute(EOperation           op,
2112                           const vector<Int8> & ids,
2113                           bool                 positive)
2114 {
2115     CRef<CSeqDBIdSet_Vector> result(new CSeqDBIdSet_Vector);
2116 
2117     CRef<CSeqDBIdSet_Vector> B(new CSeqDBIdSet_Vector(ids));
2118     x_SortAndUnique(B->Set());
2119 
2120     bool result_pos(true);
2121 
2122     x_BooleanSetOperation(op,
2123                           m_Ids->Set(),
2124                           m_Positive,
2125                           B->Set(),
2126                           positive,
2127                           result->Set(),
2128                           result_pos);
2129 
2130     m_Positive = result_pos;
2131     m_Ids = result;
2132 }
2133 
Compute(EOperation op,const vector<Uint8> & ids,bool positive)2134 void CSeqDBIdSet::Compute(EOperation           op,
2135                           const vector<Uint8> & ids,
2136                           bool                 positive)
2137 {
2138     CRef<CSeqDBIdSet_Vector> result(new CSeqDBIdSet_Vector);
2139 
2140     CRef<CSeqDBIdSet_Vector> B(new CSeqDBIdSet_Vector(ids));
2141     x_SortAndUnique(B->Set());
2142 
2143     bool result_pos(true);
2144 
2145     x_BooleanSetOperation(op,
2146                           m_Ids->Set(),
2147                           m_Positive,
2148                           B->Set(),
2149                           positive,
2150                           result->Set(),
2151                           result_pos);
2152 
2153     m_Positive = result_pos;
2154     m_Ids = result;
2155 }
2156 
Compute(EOperation op,const CSeqDBIdSet & ids)2157 void CSeqDBIdSet::Compute(EOperation op, const CSeqDBIdSet & ids)
2158 {
2159     if (m_IdType != ids.m_IdType ) {
2160         NCBI_THROW(CSeqDBException,
2161                    eArgErr,
2162                    "Set operation requested but ID types don't match.");
2163     }
2164 
2165     CRef<CSeqDBIdSet_Vector> result(new CSeqDBIdSet_Vector);
2166     bool result_pos(true);
2167 
2168     x_BooleanSetOperation(op,
2169                           m_Ids->Set(),
2170                           m_Positive,
2171                           ids.m_Ids->Get(),
2172                           ids.m_Positive,
2173                           result->Set(),
2174                           result_pos);
2175 
2176     m_Positive = result_pos;
2177     m_Ids = result;
2178 }
2179 
GetPositiveList()2180 CRef<CSeqDBGiList> CSeqDBIdSet::GetPositiveList()
2181 {
2182     CRef<CSeqDBGiList> ids(new CSeqDBGiList);
2183 
2184     if (! m_Positive) {
2185         NCBI_THROW(CSeqDBException,
2186                    eFileErr,
2187                    "Positive ID list requested but only negative exists.");
2188     }
2189 
2190     if (m_IdType == eTi) {
2191         ids->ReserveTis(m_Ids->Size());
2192 
2193         ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2194             ids->AddTi(*iter);
2195         }
2196     } else {
2197         ids->ReserveGis(m_Ids->Size());
2198 
2199         ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2200             _ASSERT(((*iter) >> 32) == 0);
2201             ids->AddGi(GI_FROM(Int8, *iter));
2202         }
2203     }
2204 
2205     return ids;
2206 }
2207 
GetNegativeList()2208 CRef<CSeqDBNegativeList> CSeqDBIdSet::GetNegativeList()
2209 {
2210     if (m_Positive) {
2211         NCBI_THROW(CSeqDBException,
2212                    eFileErr,
2213                    "Negative ID list requested but only positive exists.");
2214     }
2215 
2216     CRef<CSeqDBNegativeList> ids(new CSeqDBNegativeList);
2217 
2218     if (m_IdType == eTi) {
2219         ids->ReserveTis(m_Ids->Size());
2220 
2221         ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2222             ids->AddTi(*iter);
2223         }
2224     } else if (m_IdType == eGi) {
2225         ids->ReserveGis(m_Ids->Size());
2226 
2227         ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2228             _ASSERT(((*iter) >> 32) == 0);
2229             ids->AddGi(GI_FROM(Int8, *iter));
2230         }
2231     }
2232     else {
2233         ids->ReserveSis(m_Ids->Size());
2234 
2235         ITERATE(vector<string>, iter, m_Ids->SetSeqIDs()) {
2236             ids->AddSi(*iter);
2237         }
2238     }
2239 
2240     return ids;
2241 }
2242 
CSeqDBIdSet()2243 CSeqDBIdSet::CSeqDBIdSet()
2244     : m_Positive (false),
2245       m_IdType   (eGi),
2246       m_Ids      (new CSeqDBIdSet_Vector)
2247 {
2248 }
2249 
Blank() const2250 bool CSeqDBIdSet::Blank() const
2251 {
2252     return (! m_Positive) && (0 == m_Ids->Size());
2253 }
2254 
SeqDB_FileIntegrityAssert(const string & file,int line,const string & text)2255 void SeqDB_FileIntegrityAssert(const string & file,
2256                                int            line,
2257                                const string & text)
2258 {
2259     string msg = "Validation failed: [" + text + "] at ";
2260     msg += file + ":" + NStr::IntToString(line);
2261     SeqDB_ThrowException(CSeqDBException::eFileErr, msg);
2262 }
2263 
SeqDB_SimplifySeqid(CSeq_id & bestid,const string * acc,Int8 & num_id,string & str_id,bool & simpler)2264 ESeqDBIdType SeqDB_SimplifySeqid(CSeq_id       & bestid,
2265                                  const string  * acc,
2266                                  Int8          & num_id,
2267                                  string        & str_id,
2268                                  bool          & simpler)
2269 {
2270     ESeqDBIdType result = eStringId;
2271 
2272     const CTextseq_id * tsip = 0;
2273 
2274     bool matched = true;
2275 
2276     switch(bestid.Which()) {
2277     case CSeq_id::e_Gi:
2278         simpler = true;
2279         num_id = GI_TO(Int8, bestid.GetGi());
2280         result = eGiId;
2281         break;
2282 
2283     case CSeq_id::e_Gibbsq:    /* gibbseq */
2284         simpler = true;
2285         result = eStringId;
2286         str_id = NStr::UIntToString(bestid.GetGibbsq());
2287         break;
2288 
2289     case CSeq_id::e_General:
2290         {
2291             const CDbtag & dbt = bestid.GetGeneral();
2292 
2293             if (dbt.CanGetDb()) {
2294                 if (dbt.GetDb() == "BL_ORD_ID") {
2295                     simpler = true;
2296                     num_id = dbt.GetTag().GetId();
2297                     result = eOID;
2298                     break;
2299                 }
2300 
2301                 if (dbt.GetDb() == "PIG") {
2302                     simpler = true;
2303                     num_id = dbt.GetTag().GetId();
2304                     result = ePigId;
2305                     break;
2306                 }
2307 
2308                 if (dbt.GetDb() == "ti") {
2309                     simpler = true;
2310                     num_id = (dbt.GetTag().IsStr()
2311                               ? NStr::StringToInt8(dbt.GetTag().GetStr())
2312                               : dbt.GetTag().GetId());
2313 
2314                     result = eTiId;
2315                     break;
2316                 }
2317 
2318 
2319                 if (NStr::CompareNocase(dbt.GetDb(), "GNOMON") == 0) {
2320                     str_id = bestid.AsFastaString();
2321                     str_id = NStr::ToLower(str_id);
2322                     result = eStringId;
2323                     break;
2324                 }
2325             }
2326 
2327             if (dbt.CanGetTag() && dbt.GetTag().IsStr()) {
2328                 result = eStringId;
2329                 str_id = dbt.GetTag().GetStr();
2330                 str_id = NStr::ToLower(str_id);
2331             } else {
2332                 // Use the default logic.
2333                 matched = false;
2334             }
2335         }
2336         break;
2337 
2338     case CSeq_id::e_Local:     /* local */
2339         simpler = true;
2340         result = eStringId;
2341         {
2342             const CObject_id & objid = bestid.GetLocal();
2343 
2344             if (objid.IsStr()) {
2345                 // sparse version will leave "lcl|" off.
2346                 str_id = objid.GetStr();
2347                 str_id = NStr::ToLower(str_id);
2348             } else {
2349                 // Local numeric ids are stored as strings.
2350                 str_id = "lcl|" + NStr::IntToString(objid.GetId());
2351             }
2352         }
2353         break;
2354 
2355         // tsip types
2356 
2357     case CSeq_id::e_Embl:      /* embl */
2358     case CSeq_id::e_Ddbj:      /* ddbj */
2359     case CSeq_id::e_Genbank:   /* genbank */
2360     case CSeq_id::e_Tpg:       /* Third Party Annot/Seq Genbank */
2361     case CSeq_id::e_Tpe:       /* Third Party Annot/Seq EMBL */
2362     case CSeq_id::e_Tpd:       /* Third Party Annot/Seq DDBJ */
2363     case CSeq_id::e_Other:     /* other */
2364     case CSeq_id::e_Swissprot: /* swissprot (now with versions) */
2365     case CSeq_id::e_Gpipe:     /* internal NCBI genome pipeline */
2366         tsip = bestid.GetTextseq_Id();
2367         break;
2368 
2369     case CSeq_id::e_Pir:       /* pir   */
2370     case CSeq_id::e_Prf:       /* prf   */
2371         tsip = bestid.GetTextseq_Id();
2372         break;
2373 
2374     default:
2375         matched = false;
2376     }
2377 
2378     // Default: if we have a string, use it; if we only have seqid,
2379     // create a string.  This should not happen if the seqid matches
2380     // one of the cases above, which currently correspond to all the
2381     // supported seqid types.
2382 
2383     CSeq_id::ELabelFlags label_flags = (CSeq_id::ELabelFlags)
2384         (CSeq_id::fLabel_GeneralDbIsContent | CSeq_id::fLabel_Version);
2385 
2386     if (! matched) {
2387         // (should not happen normally)
2388 
2389         simpler = false;
2390         result  = eStringId;
2391 
2392         if (acc) {
2393             str_id = *acc;
2394             str_id = NStr::ToLower(str_id);
2395         } else {
2396             bestid.GetLabel(& str_id, CSeq_id::eFasta, label_flags);
2397             str_id = NStr::ToLower(str_id);
2398         }
2399     }
2400 
2401     if (tsip) {
2402         bool found = false;
2403 
2404         if (tsip->CanGetAccession()) {
2405             str_id = tsip->GetAccession();
2406             str_id = NStr::ToLower(str_id);
2407             found = true;
2408 
2409             if (tsip->CanGetVersion()) {
2410                 str_id += ".";
2411                 str_id += NStr::UIntToString(tsip->GetVersion());
2412             }
2413         } else if (tsip->CanGetName()) {
2414             str_id = tsip->GetName();
2415             str_id = NStr::ToLower(str_id);
2416             found = true;
2417         }
2418 
2419         if (found) {
2420             simpler = true;
2421             result = eStringId;
2422         }
2423     }
2424 
2425     return result;
2426 }
2427 
2428 /// Find the end of a single element in a Seq-id set
2429 ///
2430 /// Seq-id strings sometimes contain several Seq-ids.  This function
2431 /// looks for the end of the first Seq-id, and will return its length.
2432 /// Static methods of CSeq_id are used to evaluate tokens.
2433 ///
2434 /// @param str
2435 ///   Seq-id string to search.
2436 /// @param pos
2437 ///   Position at which to start search.
2438 /// @return
2439 ///   End position of first fasta id, or string::npos in case of error.
2440 
2441 static size_t
s_SeqDB_EndOfFastaID(const string & str,size_t pos)2442 s_SeqDB_EndOfFastaID(const string & str, size_t pos)
2443 {
2444     // (Derived from s_EndOfFastaID()).
2445 
2446     size_t vbar = str.find('|', pos);
2447 
2448     if (vbar == string::npos) {
2449         return string::npos; // bad
2450     }
2451 
2452     string portion(str, pos, vbar - pos);
2453 
2454     CSeq_id::E_Choice choice =
2455         CSeq_id::WhichInverseSeqId(portion.c_str());
2456 
2457     if (choice != CSeq_id::e_not_set) {
2458         size_t vbar_prev = vbar;
2459         int count;
2460         for (count=0; ; ++count, vbar_prev = vbar) {
2461             vbar = str.find('|', vbar_prev + 1);
2462 
2463             if (vbar == string::npos) {
2464                 break;
2465             }
2466 
2467             int start_pt = int(vbar_prev + 1);
2468             string element(str, start_pt, vbar - start_pt);
2469 
2470             choice = CSeq_id::WhichInverseSeqId(element.c_str());
2471 
2472             if (choice != CSeq_id::e_not_set) {
2473                 vbar = vbar_prev;
2474                 break;
2475             }
2476         }
2477     } else {
2478         return string::npos; // bad
2479     }
2480 
2481     return (vbar == string::npos) ? str.size() : vbar;
2482 }
2483 
2484 /// Parse string into a sequence of Seq-id objects.
2485 ///
2486 /// A string is broken down into Seq-ids and the set of Seq-ids is
2487 /// returned.
2488 ///
2489 /// @param line
2490 ///   The string to interpret.
2491 /// @param seqids
2492 ///   The returned set of Seq-id objects.
2493 /// @return
2494 ///   true if any Seq-id objects were found.
2495 
2496 static bool
s_SeqDB_ParseSeqIDs(const string & line,vector<CRef<CSeq_id>> & seqids)2497 s_SeqDB_ParseSeqIDs(const string              & line,
2498                     vector< CRef< CSeq_id > > & seqids)
2499 {
2500     // (Derived from s_ParseFastaDefline()).
2501 
2502     seqids.clear();
2503     size_t pos = 0;
2504 
2505     while (pos < line.size()) {
2506         size_t end = s_SeqDB_EndOfFastaID(line, pos);
2507 
2508         if (end == string::npos) {
2509             // We didn't get a clean parse -- ignore the data after
2510             // this point, and return what we have.
2511             break;
2512         }
2513 
2514         string element(line, pos, end - pos);
2515 
2516         CRef<CSeq_id> id;
2517 
2518         try {
2519             id = new CSeq_id(element);
2520         }
2521         catch(invalid_argument &) {
2522             // Maybe this should be done: "seqids.clear();"
2523             break;
2524         }
2525 
2526         seqids.push_back(id);
2527         pos = end + 1;
2528     }
2529 
2530     return ! seqids.empty();
2531 }
2532 
2533 
2534 
SeqDB_SimplifyAccession(const string & acc,Int8 & num_id,string & str_id,bool & simpler)2535 ESeqDBIdType SeqDB_SimplifyAccession(const string & acc,
2536                                      Int8         & num_id,
2537                                      string       & str_id,
2538                                      bool         & simpler)
2539 {
2540     ESeqDBIdType result = eStringId;
2541     num_id = (Uint4)-1;
2542 
2543     vector< CRef< CSeq_id > > seqid_set;
2544 
2545     if (s_SeqDB_ParseSeqIDs(acc, seqid_set)) {
2546         // Something like SeqIdFindBest()
2547         CRef<CSeq_id> bestid =
2548             FindBestChoice(seqid_set, CSeq_id::BestRank);
2549 
2550         result = SeqDB_SimplifySeqid(*bestid, & acc, num_id, str_id, simpler);
2551     } else {
2552 
2553         // Check for bare pdb accession with underscore (such as 12AS_A). These
2554         // are not in the isam index and need to be translated to the
2555         // standard form (pdb|12AS|A).
2556         list< CRef<CSeq_id> > seqids;
2557         try {
2558             CSeq_id::ParseFastaIds(seqids, acc, false);
2559         }
2560         catch (...) {
2561             seqids.clear();
2562         }
2563 
2564         if (!seqids.empty() && seqids.front()->IsPdb() &&
2565             acc.find("_") != string::npos) {
2566 
2567             str_id = seqids.front()->AsFastaString();
2568             str_id = NStr::ToLower(str_id);
2569         }
2570         else if (!seqids.empty() && seqids.front()->IsLocal()) {
2571         	// Chec for gnl dbs
2572         	if( acc.find(":") != string::npos) {
2573         		static const char* GNL_DBs[] = {"CDD", "SRA", "TSA", "GNOMON", NULL};
2574         		string db_tag, gnl_id;
2575         		NStr::SplitInTwo(acc, ":", db_tag, gnl_id);
2576         		const char** p = GNL_DBs;
2577         		for (;  p  &&  *p;  ++p) {
2578         			if(NStr::EqualNocase(*p, db_tag.c_str())) {
2579         				str_id  = "gnl|" + db_tag + "|"  + gnl_id;
2580         				seqids.front().Reset();
2581         				CRef<CSeq_id> new_id(new CSeq_id(str_id));
2582         				seqids.front() = new_id;
2583         				break;
2584         			}
2585         		}
2586         		if(*p == NULL) {
2587         			str_id =  acc;
2588         		}
2589         	}
2590         	else {
2591         		Int8 n_id = 0;
2592         		if (NStr::StringToNumeric<Int8>(acc,&n_id, NStr::fConvErr_NoThrow)) {
2593         			str_id = "lcl|" + acc;
2594         		}
2595         		else {
2596         			str_id = acc;
2597         		}
2598         	}
2599         }
2600         else {
2601             str_id = acc;
2602         }
2603         result = eStringId;
2604         simpler = false;
2605     }
2606 
2607     return result;
2608 }
2609 
SeqDB_SimplifyAccession(const string & acc)2610 const string SeqDB_SimplifyAccession(const string &acc)
2611 {
2612     Int8 num_id;
2613     string str_id;
2614     bool simpler(false);
2615     ESeqDBIdType result = SeqDB_SimplifyAccession(acc, num_id, str_id, simpler);
2616     if (result == eStringId) return str_id;
2617     else return "";
2618 }
2619 
SeqDB_GetFileExtensions(bool db_is_protein,vector<string> & extn,EBlastDbVersion dbver)2620 void SeqDB_GetFileExtensions(bool db_is_protein, vector<string>& extn, EBlastDbVersion dbver)
2621 {
2622     // NOTE: If more extensions are added, please keep in sync with
2623     // updatedb.pl's DistributeBlastDbsToBackends
2624     // and Blast.pm's @blastdb_extensions
2625     extn.clear();
2626 
2627     const string kExtnMol(1, db_is_protein ? 'p' : 'n');
2628 
2629     extn.push_back(kExtnMol + "al");   // alias file
2630     extn.push_back(kExtnMol + "in");   // index file
2631     extn.push_back(kExtnMol + "hr");   // header file
2632     extn.push_back(kExtnMol + "sq");   // sequence file
2633     extn.push_back(kExtnMol + "ni");   // ISAM numeric index file
2634     extn.push_back(kExtnMol + "nd");   // ISAM numeric data file
2635     if (dbver == eBDB_Version4) {
2636     	extn.push_back(kExtnMol + "si");   // ISAM string index file
2637     	extn.push_back(kExtnMol + "sd");   // ISAM string data file
2638     }
2639     extn.push_back(kExtnMol + "pi");   // ISAM PIG index file
2640     extn.push_back(kExtnMol + "pd");   // ISAM PIG data file
2641     if (dbver == eBDB_Version5) {
2642     	vector<string> lmdbs;
2643     	SeqDB_GetLMDBFileExtensions(db_is_protein, lmdbs);
2644     	extn.insert(extn.end(), lmdbs.begin(), lmdbs.end());
2645     }
2646     // Contain masking information
2647     extn.push_back(kExtnMol + "aa");   // ISAM mask index file
2648     extn.push_back(kExtnMol + "ab");   // ISAM mask data file (big-endian)
2649     extn.push_back(kExtnMol + "ac");   // ISAM mask data file (little-endian)
2650     extn.push_back(kExtnMol + "og");   // OID to GI file
2651     extn.push_back(kExtnMol + "hi");   // ISAM sequence hash index file
2652     extn.push_back(kExtnMol + "hd");   // ISAM sequence hash data file
2653     extn.push_back(kExtnMol + "ti");   // ISAM trace id index file
2654     extn.push_back(kExtnMol + "td");   // ISAM trace id data file
2655 }
2656 
2657 
SeqDB_GetLMDBFileExtensions(bool db_is_protein,vector<string> & extn)2658 void SeqDB_GetLMDBFileExtensions(bool db_is_protein, vector<string>& extn)
2659 {
2660 
2661 	static const char * ext[]={"db", "os", "ot", "tf", "to", "db-lock", "tf-lock", NULL};
2662 	extn.clear();
2663 	const string kExtnMol(1, db_is_protein ? 'p' : 'n');
2664 	for(const char ** p=ext; *p != NULL; p++) {
2665 		extn.push_back(kExtnMol + (*p));
2666 	}
2667 }
2668 
IsStringId(const CSeq_id & id)2669 bool IsStringId(const CSeq_id & id)
2670 {
2671 	 switch(id.Which()) {
2672 	    case CSeq_id::e_Gi:
2673 	    	return false;
2674 	    break;
2675 	    case CSeq_id::e_General:
2676 	    {
2677 	    	const CDbtag & dbt = id.GetGeneral();
2678             if (dbt.CanGetDb() && (dbt.GetDb() == "PIG")) {
2679             	return false;
2680             }
2681 	    }
2682 	    default:
2683 	    	return true;
2684 	    break;
2685 	 };
2686 }
2687 
GetBlastSeqIdString(const CSeq_id & seqid,bool version)2688 string GetBlastSeqIdString(const CSeq_id & seqid, bool version)
2689 {
2690 	if(seqid.IsPir() || seqid.IsPrf()) {
2691 		return seqid.AsFastaString();
2692 	}
2693 
2694 	return seqid.GetSeqIdString(version);
2695 }
2696 
2697 END_NCBI_SCOPE
2698 
2699