1 /*  $Id: seqdbisam.cpp 631527 2021-05-19 13:49:38Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Kevin Bealer
27  *
28  */
// NOTE (IRENA): All locking has been removed from this file.  CSeqDBLockHold & parameters remain in some functions but are not used.
30 /// @file seqdbisam.cpp
31 /// Implementation for the CSeqDBIsam class, which manages an ISAM
32 /// index of some particular kind of identifiers.
33 #include <ncbi_pch.hpp>
34 #include <objtools/blast/seqdb_reader/impl/seqdbisam.hpp>
35 #include <objects/seqloc/Seq_id.hpp>
36 #include <objects/general/general__.hpp>
37 #include <corelib/ncbiutil.hpp>
38 
39 /// Place these definitions in the ncbi namespace
40 BEGIN_NCBI_SCOPE
41 
42 /// Import this namespace
43 USING_SCOPE(objects);
44 
45 /// Format version of the ISAM files
46 #define ISAM_VERSION 1
47 
48 /// Default page size for numeric indices
49 #define DEFAULT_NISAM_SIZE 256
50 
51 /// Default page size for string indices
52 #define DEFAULT_SISAM_SIZE 64
53 
54 /// Special page size value which indicates a memory-only string index
55 #define MEMORY_ONLY_PAGE_SIZE 1
56 
57 
58 CSeqDBIsam::EErrorCode
x_InitSearch(void)59 CSeqDBIsam::x_InitSearch(void)
60 {
61     if(m_Initialized == true)
62         return eNoError;
63 
64     TIndx info_needed = 10 * sizeof(Int4);
65 
66     bool found_index_file =
67         m_Atlas.GetFileSizeL(m_IndexFname, m_IndexFileLength);
68 
69     if ((! found_index_file) || (m_IndexFileLength < info_needed)) {
70         return eWrongFile;
71     }
72 
73 
74     Int4 * FileInfo = (Int4*)m_IndexLease.GetFileDataPtr(m_IndexFname,0);
75 
76     // Check for consistence of files and parameters
77 
78     Int4 Version = SeqDB_GetStdOrd(& FileInfo[0]);
79 
80     if (Version != ISAM_VERSION)
81         return eBadVersion;
82 
83     Int4 IsamType = SeqDB_GetStdOrd(& FileInfo[1]);
84 
85     if (IsamType == eNumericLongId && m_Type == eNumeric) {
86         m_LongId = true;
87         m_TermSize = 12;
88         IsamType = eNumeric;
89     }
90 
91     if (IsamType != m_Type)
92         return eBadType;
93 
94     m_NumTerms    = SeqDB_GetStdOrd(& FileInfo[3]);
95     m_NumSamples  = SeqDB_GetStdOrd(& FileInfo[4]);
96     m_PageSize    = SeqDB_GetStdOrd(& FileInfo[5]);
97     m_MaxLineSize = SeqDB_GetStdOrd(& FileInfo[6]);
98 
99     if(m_PageSize != MEMORY_ONLY_PAGE_SIZE) {
100         // Special case of memory-only index
101         m_DataFileLength = SeqDB_GetStdOrd(& FileInfo[2]);
102 
103         TIndx disk_file_length(0);
104         bool found_data_file =
105             m_Atlas.GetFileSizeL(m_DataFname, disk_file_length);
106 
107         if ((! found_data_file) || (m_DataFileLength != disk_file_length)) {
108             return eWrongFile;
109         }
110     }
111 
112     // This space reserved for future use
113 
114     m_IdxOption = SeqDB_GetStdOrd(& FileInfo[7]);
115 
116     m_KeySampleOffset = (9 * sizeof(Int4));
117 
118     m_Initialized = true;
119 
120     return eNoError;
121 }
122 
x_GetPageNumElements(Int4 sample_num,Int4 * start)123 Int4 CSeqDBIsam::x_GetPageNumElements(Int4   sample_num,
124                                       Int4 * start)
125 {
126     Int4 num_elements(0);
127 
128     *start = sample_num * m_PageSize;
129 
130     if (sample_num + 1 == m_NumSamples) {
131         num_elements = m_NumTerms - *start;
132     } else {
133         num_elements = m_PageSize;
134     }
135 
136     return num_elements;
137 }
138 
139 CSeqDBIsam::EErrorCode
x_SearchIndexNumeric(Int8 Number,int * Data,Uint4 * Index,Int4 & SampleNum,bool & done)140 CSeqDBIsam::x_SearchIndexNumeric(Int8             Number,
141                                  int            * Data,
142                                  Uint4          * Index,
143                                  Int4           & SampleNum,
144                                  bool           & done)
145 
146 {
147     if(m_Initialized == false) {
148         done = true;
149         // Return just any error
150         return eInitFailed;
151     }
152 
153     if (x_OutOfBounds(Number)) {
154         done = true;
155         return eNotFound;
156     }
157 
158     _ASSERT(m_Type != eNumericNoData);
159 
160     // Search the sample file.
161 
162     Int4 Start     (0);
163     Int4 Stop      (m_NumSamples - 1);
164 
165     while(Stop >= Start) {
166         SampleNum = ((Uint4)(Stop + Start)) >> 1;
167 
168         TIndx offset_begin = m_KeySampleOffset + (m_TermSize * SampleNum);
169         //TIndx offset_end   = offset_begin + m_TermSize;
170 
171         const void* keydatap(0);
172 
173         Int8 Key(0);
174 
175         keydatap = m_IndexLease.GetFileDataPtr(m_IndexFname,offset_begin);
176         Key = x_GetNumericKey (keydatap);
177 
178         // If this is an exact match, return the master term number.
179 
180         if (Key == Number) {
181             if (Data != NULL) {
182                 *Data = x_GetNumericData(keydatap);
183             }
184 
185             if (Index != NULL)
186                 *Index = SampleNum * m_PageSize;
187 
188             done = true;
189             return eNoError;
190         }
191 
192         // Otherwise, search for the next sample.
193 
194         if ( Number < Key )
195             Stop = --SampleNum;
196         else
197             Start = SampleNum +1;
198     }
199 
200     // If the term is out of range altogether, report not finding it.
201 
202     if ( (SampleNum < 0) || (SampleNum >= m_NumSamples)) {
203 
204         if (Data != NULL)
205             *Data = eNotFound;
206 
207         if(Index != NULL)
208             *Index = eNotFound;
209 
210         done = true;
211         return eNotFound;
212     }
213 
214     done = false;
215     return eNoError;
216 }
217 
218 void
x_SearchNegativeMulti(int vol_start,int vol_end,CSeqDBNegativeList & ids,bool use_tis)219 CSeqDBIsam::x_SearchNegativeMulti(int                  vol_start,
220                                   int                  vol_end,
221                                   CSeqDBNegativeList & ids,
222                                   bool                 use_tis)
223 
224 {
225     if(m_Initialized == false) {
226         NCBI_THROW(CSeqDBException,
227                       eArgErr,
228                       "Error: Unable to use ISAM index in batch mode.");
229     }
230 
231     //m_Atlas.Lock(locked);
232 
233 
234     // We can use Parabolic Binary Search for the negative GI list but
235     // not for the ISAM file data, because in the negative ID list
236     // case, every line of the ISAM data must be looked at.
237 
238     _ASSERT(m_Type != eNumericNoData);
239 
240     //......................................................................
241     //
242     // Translate the entire Gi List.
243     //
244     //......................................................................
245 
246     int gilist_size = use_tis ? ids.GetNumTis() : ids.GetNumGis();
247 
248     int gilist_index = 0;
249 
250     int sample_index(0);
251     const void * data_page (0);
252 
253     while(sample_index < m_NumSamples) {
254         int start = 0, num_elements = 0;
255 
256         x_MapDataPage(sample_index,
257                       start,
258                       num_elements,
259                       & data_page);
260 
261         for(int i = 0; i < num_elements; i++) {
262             Int8 isam_key(0);
263             int isam_data(0);
264 
265             // 1. Get the ID+OID from the data page.
266 
267             x_GetDataElement(data_page,
268                              i,
269                              isam_key,
270                              isam_data);
271 
272             // 2. Look for it in the negative id list.
273 
274             bool found = false;
275 
276             if (gilist_index < gilist_size) {
277                 found = x_FindInNegativeList(ids,
278                                              gilist_index,
279                                              isam_key,
280                                              use_tis);
281             }
282 
283             // 3. If not found, add the OID to the negative ID list.
284 
285             if (isam_data < vol_end) {
286                 if (found) {
287                     // OID is found, but may not be included yet.
288                     ids.AddVisibleOid(isam_data + vol_start);
289                 } else {
290                     // OID is included for iteration.
291                     ids.AddIncludedOid(isam_data + vol_start);
292                 }
293             }
294         }
295 
296         // Move to next data page.  Note that for a negative ID list
297         // processing, we don't actually fetch any samples, because
298         // every ID->OID line needs to be examined anyway.
299 
300         sample_index ++;
301     }
302 }
303 
304 //In case if acc2 does not have version and acc1 has version
305 //check if it is the same accession
s_IsSameAccession(string acc1,string acc2)306 static bool s_IsSameAccession(string acc1, string acc2)
307 {
308     bool sameAccession = false;
309     if(NStr::Find(acc2,".") == NPOS) { // no version in acc2
310         if(NStr::Find(acc1,".") != NPOS && NStr::Find(acc1, acc2) != NPOS) {
311             string accession, version;
312             NStr::SplitInTwo(acc1,".", accession, version);
313             if(acc2 == accession) {
314                 sameAccession = true;
315             }
316         }
317     }
318     return sameAccession;
319 }
320 
321 //In case if keys[currIndex] does not have version and keys[currIndex + 1] has version
322 //check if it is the same accession
s_IsSameAccession(vector<string> keys,int num_keys,int currIndex)323 static bool s_IsSameAccession(vector <string> keys, int num_keys, int currIndex)
324 {
325     bool sameAccession = false;
326     if(currIndex < num_keys - 1) {
327         sameAccession = s_IsSameAccession(keys[currIndex + 1], keys[currIndex]);
328     }
329     return sameAccession;
330 }
331 
332 void
x_SearchNegativeMultiSeq(int vol_start,int vol_end,CSeqDBNegativeList & ids)333 CSeqDBIsam::x_SearchNegativeMultiSeq(int              vol_start,
334                                      int              vol_end,
335                                      CSeqDBNegativeList & ids)
336 
337 {
338         int gilist_size = ids.ListSize();
339         if (! gilist_size) return;
340 
341         if(m_Initialized == false) {
342             // Most ordinary errors (missing GIs for example) are
343             // ignored for "multi" mode searches.  But if a GI list is
344             // specified, and cannot be interpreted, it is an error.
345 
346             NCBI_THROW(CSeqDBException,
347                        eArgErr,
348                        "Error: Unable to use ISAM index in batch mode.");
349         }
350 
351 
352         vector<string> sample_keys;
353         vector<TIndx> page_offs;
354         vector<string> keys;
355         vector<int> vals;
356 
357         sample_keys.reserve(m_NumSamples);
358         page_offs.reserve(m_NumSamples + 1);
359         keys.reserve(m_PageSize);
360         vals.reserve(m_PageSize);
361 
362 
363         x_LoadIndex(m_IndexLease, sample_keys, page_offs);
364 
365         int gilist_index = 0;
366         int sample_index = 0;
367 
368         while(sample_index < m_NumSamples) {
369 
370             // Now we should be ready to search a data block.
371             keys.clear();
372             vals.clear();
373 
374             int num_keys = m_PageSize;
375             if (sample_index + 1 == m_NumSamples) {
376                 num_keys = m_NumTerms - sample_index * m_PageSize;
377             }
378 
379             x_LoadData(m_DataLease, keys, vals, num_keys, page_offs[sample_index]);
380 
381             for(int i = 0; i < num_keys; i++) {
382                 // 2. Look for it in the negative id list.
383 
384                 bool found = false;
385                 if (gilist_index < gilist_size) {
386                     found = x_FindInNegativeList(ids,
387                                              gilist_index,
388                                              keys[i]);
389 
390                 }
391                 if (vals[i] < vol_end) {
392                     if (found) {
393                         // OID is found, but may not be included yet.
394                         ids.AddVisibleOid(vals[i] + vol_start);
395                         //If next accession is the same as current, but with version
396                         if(s_IsSameAccession(keys, num_keys, i)) {
397                             i++; //skip next check - sequence already excluded
398                         }
399                     } else {
400                         // OID is included for iteration.
401                         //Only include next accession if it is not the same as current (but with version)
402                         //because it may be in exclude list. The check will be done in the next step
403                         if(!s_IsSameAccession(keys, num_keys, i)) {
404                             ids.AddIncludedOid(vals[i] + vol_start);
405                         }
406                     }
407                 }
408 
409             }
410             // Move to next data page.  Note that for a negative ID list
411             // processing, we don't actually fetch any samples, because
412             // every ID->OID line needs to be examined anyway.
413 
414             sample_index ++;
415 
416         }
417 }
418 
419 
420 CSeqDBIsam::EErrorCode
x_SearchDataNumeric(Int8 Number,int * Data,Uint4 * Index,Int4 SampleNum)421 CSeqDBIsam::x_SearchDataNumeric(Int8             Number,
422                                 int            * Data,
423                                 Uint4          * Index,
424                                 Int4             SampleNum)
425 
426 {
427     // Load the appropriate page of numbers into memory.
428     _ASSERT(m_Type != eNumericNoData);
429 
430     Int4 Start(0);
431     Int4 NumElements = x_GetPageNumElements(SampleNum, & Start);
432 
433     Int4 first = Start;
434     Int4 last  = Start + NumElements - 1;
435 
436     const void * KeyDataPage      = NULL;
437     const void * KeyDataPageStart = NULL;
438 
439     TIndx offset_begin = Start * m_TermSize;
440     //TIndx offset_end = offset_begin + m_TermSize * NumElements;
441 
442     KeyDataPageStart = m_DataLease.GetFileDataPtr(m_DataFname,offset_begin);
443 
444 
445     KeyDataPage = (char *)KeyDataPageStart - Start * m_TermSize;
446 
447     bool found   (false);
448     Int4 current (0);
449 
450     // Search the page for the number.
451     while (first <= last) {
452         current = (first+last)/2;
453 
454         Int8 Key = x_GetNumericKey((char *)KeyDataPage + current * m_TermSize);
455 
456         if (Key > Number) {
457             last = --current;
458         } else if (Key < Number) {
459             first = ++current;
460         } else {
461             found = true;
462             break;
463         }
464     }
465 
466     if (found == false) {
467         if (Data != NULL)
468             *Data = eNotFound;
469 
470         if(Index != NULL)
471             *Index = eNotFound;
472 
473         return eNotFound;
474     }
475 
476     if (Data != NULL) {
477         *Data = x_GetNumericData((char *)KeyDataPage + current * m_TermSize);
478     }
479 
480     if(Index != NULL)
481         *Index = Start + current;
482 
483     return eNoError;
484 }
485 
486 
487 // ------------------------NumericSearch--------------------------
488 // Purpose:     Main search function of Numeric ISAM
489 //
490 // Parameters:  Key - interer to search
491 //              Data - returned value (for NIASM with data)
492 //              Index - internal index in database
493 // Returns:     ISAM Error Code
494 // NOTE:        None
495 // ----------------------------------------------------------------
496 
497 CSeqDBIsam::EErrorCode
x_NumericSearch(Int8 Number,int * Data,Uint4 * Index)498 CSeqDBIsam::x_NumericSearch(Int8             Number,
499                             int            * Data,
500                             Uint4          * Index)
501 
502 {
503     bool done      (false);
504     Int4 SampleNum (0);
505 
506     EErrorCode error =
507         x_SearchIndexNumeric(Number, Data, Index, SampleNum, done);
508 
509     if (! done) {
510         error = x_SearchDataNumeric(Number, Data, Index, SampleNum);
511     }
512 
513     return error;
514 }
515 
x_DiffCharLease(const string & term_in,CSeqDBFileMemMap & lease,const string & file_name,TIndx file_length,Uint4 at_least,TIndx KeyOffset,bool ignore_case)516 int CSeqDBIsam::x_DiffCharLease(const string   & term_in,
517                                 CSeqDBFileMemMap & lease,
518                                 const string   & file_name,
519                                 TIndx            file_length,
520                                 Uint4            at_least,
521                                 TIndx            KeyOffset,
522                                 bool             ignore_case)
523 
524 {
525     int result(-1);
526 
527     //m_Atlas.Lock(locked);
528 
529     // Add one to term_end to insure we don't consider "AA" and "AAB"
530     // as equal.
531 
532     TIndx offset_begin = KeyOffset;
533     TIndx term_end     = KeyOffset + term_in.size() + 1;
534     TIndx map_end      = term_end + at_least;
535 
536     if (map_end > file_length) {
537         map_end = file_length;
538 
539         if (term_end > map_end) {
540             term_end = map_end;
541             result = int(file_length - offset_begin);
542         }
543     }
544 
545     const char * file_data = (const char *)lease.GetFileDataPtr(file_name,offset_begin);
546 
547     Int4 dc_result =
548         x_DiffChar(term_in,
549                    file_data,
550                    file_data + term_in.size() + 1,
551                    ignore_case);
552 
553     if (dc_result != -1) {
554         return dc_result;
555     }
556 
557     return result;
558 }
559 
560 /// Return NUL for nulls or EOL characters
561 ///
562 /// This function returns a NUL byte for any of NUL, CR, or NL.  This
563 /// is done because these characters are used to terminate the
564 /// variable length records in a string-based ISAM file.
565 ///
566 /// @param c
567 ///   A character
568 /// @return
569 ///   NUL or the same character
570 static inline char
s_SeqDBIsam_NullifyEOLs(char c)571 s_SeqDBIsam_NullifyEOLs(char c)
572 {
573     if (SEQDB_ISEOL(c)) {
574         return 0;
575     } else {
576         return c;
577     }
578 }
579 
580 /// The terminating character for string ISAM keys when data is present.
581 const char ISAM_DATA_CHAR = (char) 2;
582 
583 /// Returns true if the character is a terminator for an ISAM key.
ENDS_ISAM_KEY(char P)584 static inline bool ENDS_ISAM_KEY(char P)
585 {
586     return (P == ISAM_DATA_CHAR) || (s_SeqDBIsam_NullifyEOLs(P) == 0);
587 }
588 
x_DiffChar(const string & term_in,const char * begin,const char * end,bool ignore_case)589 Int4 CSeqDBIsam::x_DiffChar(const string & term_in,
590                             const char   * begin,
591                             const char   * end,
592                             bool           ignore_case)
593 {
594     int result(-1);
595     int i(0);
596 
597     const char * file_data = begin;
598     int bytes = int(end - begin);
599 
600     for(i = 0; (i < bytes) && i < (int) term_in.size(); i++) {
601         char ch1 = term_in[i];
602         char ch2 = file_data[i];
603 
604         if (ch1 != ch2) {
605             ch1 = s_SeqDBIsam_NullifyEOLs(ch1);
606             ch2 = s_SeqDBIsam_NullifyEOLs(ch2);
607 
608             if (ignore_case) {
609                 ch1 = toupper((unsigned char) ch1);
610                 ch2 = toupper((unsigned char) ch2);
611             }
612 
613             if (ch1 != ch2) {
614                 break;
615             }
616         }
617     }
618 
619     const char * p = file_data + i;
620 
621     while((p < end) && ((*p) == ' ')) {
622         p++;
623     }
624 
625     if (((p == end) || ENDS_ISAM_KEY(*p)) && (i == (int) term_in.size())) {
626         result = -1;
627     } else {
628         result = i;
629     }
630 
631     return result;
632 }
633 
x_ExtractPageData(const string & term_in,TIndx page_index,const char * beginp,const char * endp,vector<TIndx> & indices_out,vector<string> & keys_out,vector<string> & data_out)634 void CSeqDBIsam::x_ExtractPageData(const string   & term_in,
635                                    TIndx            page_index,
636                                    const char     * beginp,
637                                    const char     * endp,
638                                    vector<TIndx>  & indices_out,
639                                    vector<string> & keys_out,
640                                    vector<string> & data_out)
641 {
642     // Collect all 'good' data from the page.
643 
644     bool ignore_case = true;
645 
646     Uint4 TermNum(0);
647 
648     const char * indexp(beginp);
649     bool found_match(false);
650 
651     while (indexp < endp) {
652         Int4 Diff = x_DiffChar(term_in,
653                                indexp,
654                                endp,
655                                ignore_case);
656 
657         if (Diff == -1) { // Complete match
658             found_match = true;
659 
660             x_ExtractData(indexp,
661                           endp,
662                           keys_out,
663                           data_out);
664 
665             indices_out.push_back(page_index + TermNum);
666         } else {
667             // If we found a match, but the current term doesn't
668             // match, then we are past the set of matching entries.
669 
670             if (found_match) {
671                 break;
672             }
673         }
674 
675         // Skip remainder of term, and any nulls after it.
676 
677         while((indexp < endp) && s_SeqDBIsam_NullifyEOLs(*indexp)) {
678             indexp++;
679         }
680         while((indexp < endp) && (! s_SeqDBIsam_NullifyEOLs(*indexp))) {
681             indexp++;
682         }
683 
684         TermNum++;
685     }
686 }
687 
x_ExtractAllData(const string & term_in,TIndx sample_index,vector<TIndx> & indices_out,vector<string> & keys_out,vector<string> & data_out)688 void CSeqDBIsam::x_ExtractAllData(const string   & term_in,
689                                   TIndx            sample_index,
690                                   vector<TIndx>  & indices_out,
691                                   vector<string> & keys_out,
692                                   vector<string> & data_out)
693 
694 {
695     // The object at sample_index is known to match; we will iterate
696     // over the surrounding values to see if they match as well.  No
697     // assumptions about how many keys can match are made here.
698 
699     bool ignore_case = true;
700 
701     int pre_amt  = 1;
702     int post_amt = 1;
703 
704     bool done_b(false), done_e(false);
705 
706     const char * beginp(0);
707     const char * endp(0);
708 
709     TIndx beg_off(0);
710     TIndx end_off(0);
711 
712     while(! (done_b && done_e)) {
713         if (sample_index < pre_amt) {
714             beg_off = 0;
715             done_b = true;
716         } else {
717             beg_off = sample_index - pre_amt;
718         }
719 
720         if ((m_NumSamples - sample_index) < post_amt) {
721             end_off = m_NumSamples;
722             done_e = true;
723         } else {
724             end_off = sample_index + post_amt;
725         }
726 
727         x_LoadPage(beg_off, end_off, & beginp, & endp);
728 
729         if (! done_b) {
730             Int4 diff_begin = x_DiffChar(term_in,
731                                          beginp,
732                                          endp,
733                                          ignore_case);
734 
735             if (diff_begin != -1) {
736                 done_b = true;
737             } else {
738                 pre_amt ++;
739             }
740         }
741 
742         if (! done_e) {
743             const char * last_term(0);
744             const char * p(endp-1);
745 
746             // Skip over any non-terminating junk at the end
747 
748             enum { eEndNulls, eLastTerm } search_stage = eEndNulls;
749 
750             while(p > beginp) {
751                 bool terminal = (0 == s_SeqDBIsam_NullifyEOLs(*p));
752 
753                 if (search_stage == eEndNulls) {
754                     if (! terminal) {
755                         search_stage = eLastTerm;
756                     }
757                 } else {
758                     if (terminal) {
759                         last_term = p + 1;
760                         break;
761                     }
762                 }
763 
764                 p--;
765             }
766 
767             if (! last_term) {
768                 last_term = beginp;
769             }
770 
771             Int4 diff_end = x_DiffChar(term_in,
772                                        last_term,
773                                        endp,
774                                        ignore_case);
775 
776             if (diff_end != -1) {
777                 done_e = true;
778             } else {
779                 post_amt ++;
780             }
781         }
782     }
783 
784     x_ExtractPageData(term_in,
785                       m_PageSize * beg_off,
786                       beginp,
787                       endp,
788                       indices_out,
789                       keys_out,
790                       data_out);
791 }
792 
x_ExtractData(const char * key_start,const char * map_end,vector<string> & keys_out,vector<string> & data_out)793 void CSeqDBIsam::x_ExtractData(const char     * key_start,
794                                const char     * map_end,
795                                vector<string> & keys_out,
796                                vector<string> & data_out)
797 {
798     const char * data_ptr(0);
799     const char * p(key_start);
800 
801     while(p < map_end) {
802         switch(s_SeqDBIsam_NullifyEOLs(*p)) {
803         case 0:
804             if (data_ptr) {
805                 keys_out.push_back(string(key_start, data_ptr));
806                 data_out.push_back(string(data_ptr+1, p));
807             } else {
808                 keys_out.push_back(string(key_start, p));
809                 data_out.push_back("");
810             }
811             return;
812 
813         case ISAM_DATA_CHAR:
814             data_ptr = p;
815 
816         default:
817             p++;
818         }
819     }
820 }
821 
822 CSeqDBIsam::TIndx
x_GetIndexKeyOffset(TIndx sample_offset,Uint4 sample_num)823 CSeqDBIsam::x_GetIndexKeyOffset(TIndx            sample_offset,
824                                 Uint4            sample_num)
825 
826 {
827     TIndx offset_begin = sample_offset + (sample_num * sizeof(Uint4));
828     //TIndx offset_end   = offset_begin + sizeof(Uint4);
829 
830 
831     Int4 * key_offset_addr = (Int4 *)m_IndexLease.GetFileDataPtr(offset_begin);
832     return SeqDB_GetStdOrd(key_offset_addr);
833 }
834 
835 void
x_GetIndexString(TIndx key_offset,int length,string & str,bool trim_to_null)836 CSeqDBIsam::x_GetIndexString(TIndx            key_offset,
837                              int              length,
838                              string         & str,
839                              bool             trim_to_null)
840 
841 {
842     //TIndx offset_end = key_offset + length;
843 
844     const char * key_offset_addr =
845         (const char *)m_IndexLease.GetFileDataPtr(key_offset);
846 
847 
848     if (trim_to_null) {
849         for(int i = 0; i<length; i++) {
850             if (! key_offset_addr[i]) {
851                 length = i;
852                 break;
853             }
854         }
855     }
856 
857     str.assign(key_offset_addr, length);
858 }
859 
860 // Given an index, this computes the diff from the input term.  It
861 // also returns the offset for that sample's key in KeyOffset.
862 
x_DiffSample(const string & term_in,Uint4 SampleNum,TIndx & KeyOffset)863 int CSeqDBIsam::x_DiffSample(const string   & term_in,
864                              Uint4            SampleNum,
865                              TIndx          & KeyOffset)
866 
867 {
868     // Meaning:
869     // a. Compute SampleNum*4
870     // b. Address this number into SamplePos (indexlease)
871     // c. Swap this number to compute Key offset.
872     // d. Add to beginning of file to get key data pointer.
873 
874     bool ignore_case(true);
875 
876     TIndx SampleOffset(m_KeySampleOffset);
877 
878     if(m_PageSize != MEMORY_ONLY_PAGE_SIZE) {
879         SampleOffset += (m_NumSamples + 1) * sizeof(Uint4);
880     }
881 
882     TIndx offset_begin = SampleOffset + (SampleNum * sizeof(Uint4));
883     //TIndx offset_end   = offset_begin + sizeof(Uint4);
884 
885     KeyOffset = SeqDB_GetStdOrd((Int4*)m_IndexLease.GetFileDataPtr(offset_begin));
886 
887     Uint4 max_lines_2 = m_MaxLineSize * 2;
888 
889     return x_DiffCharLease(term_in,
890                            m_IndexLease,
891                            m_IndexFname,
892                            m_IndexFileLength,
893                            max_lines_2,
894                            KeyOffset,
895                            ignore_case);
896 
897 }
898 
x_LoadPage(TIndx SampleNum1,TIndx SampleNum2,const char ** beginp,const char ** endp)899 void CSeqDBIsam::x_LoadPage(TIndx             SampleNum1,
900                             TIndx             SampleNum2,
901                             const char     ** beginp,
902                             const char     ** endp)
903 
904 {
905     // Load the appropriate page of terms into memory.
906 
907     _ASSERT(SampleNum2 > SampleNum1);
908 
909     TIndx begin_offset = m_KeySampleOffset + SampleNum1       * sizeof(Uint4);
910     //TIndx end_offset   = m_KeySampleOffset + (SampleNum2 + 1) * sizeof(Uint4);
911 
912     Uint4 * key_offsets((Uint4*)m_IndexLease.GetFileDataPtr(begin_offset));
913 
914 
915     Uint4 key_off1 = SeqDB_GetStdOrd(& key_offsets[0]);
916     Uint4 key_off2 = SeqDB_GetStdOrd(& key_offsets[SampleNum2 - SampleNum1]);
917 
918     *beginp = (const char *) m_DataLease.GetFileDataPtr(m_DataFname,key_off1);
919     *endp   = (const char *) m_DataLease.GetFileDataPtr(key_off2);
920 }
921 
922 
923 // ------------------------StringSearch--------------------------
924 // Purpose:     Main search function of string search.
925 //
926 // Parameters:  Key - interer to search
927 //              Data - returned value
928 //              Index - internal index in database
929 // Returns:     ISAM Error Code
930 // NOTE:        None
931 // --------------------------------------------------------------
932 
933 CSeqDBIsam::EErrorCode
x_StringSearch(const string & term_in,vector<string> & terms_out,vector<string> & values_out,vector<TIndx> & indices_out)934 CSeqDBIsam::x_StringSearch(const string   & term_in,
935                            vector<string> & terms_out,
936                            vector<string> & values_out,
937                            vector<TIndx>  & indices_out)
938 
939 {
940     // These are always false; They may relate to the prior find_one /
941     // expand_to_many method of getting multiple OIDs.
942 
943     bool short_match(false);
944     bool follow_match(false);
945 
946     size_t preexisting_data_count = values_out.size();
947 
948     if (m_Initialized == false) {
949         return eInitFailed;
950     }
951 
952     if (x_OutOfBounds(term_in)) {
953         return eNotFound;
954     }
955 
956     // We will set this option to avoid more complications
957     bool ignore_case = true;
958 
959     // search the sample file first
960 
961     TIndx Start(0);
962     TIndx Stop(m_NumSamples - 1);
963 
964     int Length = (int) term_in.size();
965 
966     TIndx SampleOffset(m_KeySampleOffset);
967 
968     if(m_PageSize != MEMORY_ONLY_PAGE_SIZE) {
969         SampleOffset += (m_NumSamples + 1) * sizeof(Uint4);
970     }
971 
972     int found_short(-1);
973 
974     string short_term;
975     int SampleNum(-1);
976 
977     while(Stop >= Start) {
978         SampleNum = ((Uint4)(Stop + Start)) >> 1;
979 
980         TIndx KeyOffset(0);
981 
982         int diff = x_DiffSample(term_in, SampleNum, KeyOffset);
983 
984         // If this is an exact match, return the master term number.
985 
986         const char * KeyData = (const char *)m_IndexLease.GetFileDataPtr(KeyOffset);
987         TIndx BytesToEnd = m_IndexFileLength - KeyOffset;
988 
989         Uint4 max_lines_2 = m_MaxLineSize * 2;
990 
991         if (BytesToEnd > (TIndx) max_lines_2) {
992             BytesToEnd = max_lines_2;
993         }
994 
995         if (diff == -1) {
996             x_ExtractAllData(term_in,
997                              SampleNum,
998                              indices_out,
999                              terms_out,
1000                              values_out);
1001 
1002 
1003             return eNoError;
1004         }
1005 
1006         // If the key is a superset of the sample term, backup until
1007         // just before the term.
1008 
1009         if (short_match && (diff >= Length)) {
1010             if (SampleNum > 0)
1011                 SampleNum--;
1012 
1013             while(SampleNum > 0) {
1014                 TIndx key_offset =
1015                     x_GetIndexKeyOffset(SampleOffset,
1016                                         SampleNum);
1017 
1018 
1019                 string prefix;
1020                 x_GetIndexString(key_offset, Length, prefix, false);
1021 
1022                 if (ignore_case) {
1023                     if (NStr::CompareNocase(prefix, term_in) != 0) {
1024                         break;
1025                     }
1026                 } else {
1027                     if (prefix != term_in) {
1028                         break;
1029                     }
1030                 }
1031 
1032                 SampleNum--;
1033             }
1034 
1035             found_short = SampleNum + 1;
1036 
1037             TIndx key_offset =
1038                 x_GetIndexKeyOffset(SampleOffset,
1039                                     SampleNum + 1);
1040 
1041 
1042             string prefix;
1043             x_GetIndexString(key_offset, max_lines_2, short_term, true);
1044 
1045             break;
1046         } else {
1047             // If preceding is desired, note the key.
1048 
1049             if (follow_match) {
1050                 found_short = SampleNum;
1051 
1052                 x_GetIndexString(KeyOffset, max_lines_2, short_term, true);
1053             }
1054         }
1055 
1056         // Otherwise, search for the next sample.
1057 
1058         if (ignore_case
1059             ? tolower((unsigned char) term_in[diff]) < tolower((unsigned char) KeyData[diff])
1060             : term_in[diff] < KeyData[diff]) {
1061             Stop = --SampleNum;
1062         } else {
1063             Start = SampleNum + 1;
1064         }
1065     }
1066 
1067 
1068     // If the term is out of range altogether, report not finding it.
1069 
1070     if ( (SampleNum < 0) || (SampleNum >= m_NumSamples)) {
1071         return eNotFound;
1072     }
1073 
1074     // Load the appropriate page of terms into memory.
1075 
1076     const char * beginp(0);
1077     const char * endp(0);
1078 
1079     x_LoadPage(SampleNum, SampleNum + 1, & beginp, & endp);
1080 
1081     // Search the page for the term.
1082 
1083     x_ExtractPageData(term_in,
1084                       m_PageSize * SampleNum,
1085                       beginp,
1086                       endp,
1087                       indices_out,
1088                       terms_out,
1089                       values_out);
1090 
1091     // For now the short and follow logic is not implemented.
1092 
1093     EErrorCode rv(eNoError);
1094 
1095     if (preexisting_data_count == values_out.size()) {
1096         rv = eNotFound;
1097     }
1098 
1099     return rv;
1100 }
1101 
CSeqDBIsam(CSeqDBAtlas & atlas,const string & dbname,char prot_nucl,char file_ext_char,ESeqDBIdType ident_type)1102 CSeqDBIsam::CSeqDBIsam(CSeqDBAtlas  & atlas,
1103                        const string & dbname,
1104                        char           prot_nucl,
1105                        char           file_ext_char,
1106                        ESeqDBIdType   ident_type)
1107     : m_Atlas          (atlas),
1108       m_IdentType      (ident_type),
1109       m_IndexLease     (atlas),
1110       m_DataLease      (atlas),
1111       m_Type           (eNumeric),
1112       m_NumTerms       (0),
1113       m_NumSamples     (0),
1114       m_PageSize       (0),
1115       m_MaxLineSize    (0),
1116       m_IdxOption      (0),
1117       m_Initialized    (false),
1118       m_KeySampleOffset(0),
1119       m_TestNonUnique  (true),
1120       m_FileStart      (0),
1121       m_FirstOffset    (0),
1122       m_LastOffset     (0),
1123       m_LongId         (false),
1124       m_TermSize       (8)
1125 {
1126     // These are the types that readdb.c seems to use.
1127 
1128     switch(ident_type) {
1129     case eGiId:
1130     case ePigId:
1131     case eTiId:
1132         m_Type = eNumeric;
1133         break;
1134 
1135     case eStringId:
1136     case eHashId:
1137         m_Type = eString;
1138         break;
1139 
1140     default:
1141         NCBI_THROW(CSeqDBException,
1142                    eArgErr,
1143                    "Error: ident type argument not valid");
1144     }
1145 
1146     x_MakeFilenames(dbname,
1147                     prot_nucl,
1148                     file_ext_char,
1149                     m_IndexFname,
1150                     m_DataFname);
1151 
1152     if (! (CFile(m_IndexFname).Exists() &&
1153            CFile(m_DataFname).Exists()) ) {
1154 
1155         string msg("Error: Could not open input file (");
1156         msg += m_IndexFname + "/" + m_DataFname + ")";
1157         NCBI_THROW(CSeqDBException, eFileErr, msg);
1158     }
1159     m_IndexLease.Init(m_IndexFname);
1160     m_DataLease.Init(m_DataFname);
1161     if(m_Type == eNumeric) {
1162         m_PageSize = DEFAULT_NISAM_SIZE;
1163     } else {
1164         m_PageSize = DEFAULT_SISAM_SIZE;
1165     }
1166     if (eNoError !=x_InitSearch()) {
1167     	m_Initialized = false;
1168     }
1169     x_FindIndexBounds();
1170 }
1171 
x_MakeFilenames(const string & dbname,char prot_nucl,char file_ext_char,string & index_name,string & data_name)1172 void CSeqDBIsam::x_MakeFilenames(const string & dbname,
1173                                  char           prot_nucl,
1174                                  char           file_ext_char,
1175                                  string       & index_name,
1176                                  string       & data_name)
1177 {
1178     if (dbname.empty() ||
1179         (! isalpha((unsigned char) prot_nucl)) ||
1180         (! isalpha((unsigned char) file_ext_char))) {
1181 
1182         NCBI_THROW(CSeqDBException,
1183                    eArgErr,
1184                    "Error: argument not valid");
1185     }
1186 
1187     index_name.reserve(dbname.size() + 4);
1188     data_name.reserve(dbname.size() + 4);
1189 
1190     index_name = dbname;
1191     index_name += '.';
1192     index_name += prot_nucl;
1193     index_name += file_ext_char;
1194 
1195     data_name = index_name;
1196     index_name += 'i';
1197     data_name  += 'd';
1198 }
1199 
IndexExists(const string & dbname,char prot_nucl,char file_ext_char)1200 bool CSeqDBIsam::IndexExists(const string & dbname,
1201                              char           prot_nucl,
1202                              char           file_ext_char)
1203 {
1204     string iname, dname;
1205     x_MakeFilenames(dbname, prot_nucl, file_ext_char, iname, dname);
1206 
1207     return CFile(iname).Exists() && CFile(dname).Exists();
1208 }
1209 
~CSeqDBIsam()1210 CSeqDBIsam::~CSeqDBIsam()
1211 {
1212     UnLease();
1213 }
1214 //Remove this
UnLease()1215 void CSeqDBIsam::UnLease()
1216 {
1217     m_IndexLease.Clear();
1218     m_DataLease.Clear();
1219 }
1220 
x_IdentToOid(Int8 ident,TOid & oid)1221 bool CSeqDBIsam::x_IdentToOid(Int8 ident, TOid & oid)
1222 {
1223     EErrorCode err =
1224         x_NumericSearch(ident, & oid, 0);
1225 
1226     if (err == eNoError) {
1227         return true;
1228     }
1229 
1230     oid = -1u;  /* NCBI_FAKE_WARNING */
1231 
1232     return false;
1233 }
1234 
StringToOids(const string & acc,vector<TOid> & oids,bool adjusted,bool & version_check)1235 void CSeqDBIsam::StringToOids(const string   & acc,
1236                               vector<TOid>   & oids,
1237                               bool             adjusted,
1238                               bool           & version_check)
1239 
1240 {
1241     bool strip_version = version_check;
1242     version_check = false;
1243 
1244     _ASSERT(m_IdentType == eStringId);
1245 
1246     if(m_Initialized == false) {
1247             return;
1248     }
1249 
1250     bool found = false;
1251 
1252     string accession(string("gb|") + acc + "|");
1253     string locus_str(string("gb||") + acc);
1254 
1255     EErrorCode err = eNoError;
1256 
1257     vector<string> keys_out;
1258     vector<string> data_out;
1259     vector<TIndx>  indices_out;
1260 
1261     if (! adjusted) {
1262         if ((err = x_StringSearch(accession,
1263                                   keys_out,
1264                                   data_out,
1265                                   indices_out)) < 0) {
1266             return;
1267         }
1268 
1269         if (err == eNoError) {
1270             found = true;
1271         }
1272 
1273         if ((! found) &&
1274             (err = x_StringSearch(locus_str,
1275                                   keys_out,
1276                                   data_out,
1277                                   indices_out)) < 0) {
1278 
1279             return;
1280         }
1281 
1282         if (err != eNotFound) {
1283             found = true;
1284         }
1285     }
1286 
1287     if ((! found) &&
1288         (err = x_StringSearch(acc,
1289                               keys_out,
1290                               data_out,
1291                               indices_out)) < 0) {
1292 
1293 
1294         return;
1295     }
1296 
1297     if (err != eNotFound) {
1298         found = true;
1299     }
1300 
1301     if ((! found) && strip_version) {
1302         size_t pos = acc.find(".");
1303 
1304         bool is_version = false;
1305 
1306         if (pos != string::npos) {
1307             int ver_len = acc.size() - pos - 1;
1308 
1309             is_version = (ver_len <= 3 && ver_len >= 1);
1310 
1311             for(size_t vp = pos+1; vp < acc.size(); vp++) {
1312                 if (! isdigit(acc[vp])) {
1313                     is_version = false;
1314                     break;
1315                 }
1316             }
1317         }
1318 
1319         if (is_version) {
1320             string nover(acc, 0, pos);
1321 
1322             err = x_StringSearch(nover,
1323                                  keys_out,
1324                                  data_out,
1325                                  indices_out);
1326 
1327 
1328             if (data_out.size()) {
1329                 version_check = true;
1330             }
1331 
1332             if (err < 0) {
1333                 return;
1334             }
1335         }
1336     }
1337 
1338     if (err != eNotFound) {
1339         found = true;
1340     }
1341 
1342     if (! found) {
1343         // Use CSeq_id to parse the id string and build a replacement,
1344         // FASTA type string.  This allows some IDs, such as PDBs with
1345         // chains, such as '1qcfA' to be parsed.
1346 
1347         string id;
1348 
1349         try {
1350             CSeq_id seqid(acc, CSeq_id::fParse_RawText | CSeq_id::fParse_AnyLocal);
1351             id = seqid.AsFastaString();
1352         }
1353         catch(CSeqIdException &) {
1354         }
1355 
1356         if (id.size() &&
1357             ((err = x_StringSearch(id,
1358                                    keys_out,
1359                                    data_out,
1360                                    indices_out)) < 0)) {
1361 
1362             return;
1363         }
1364     }
1365 
1366     if (err != eNotFound) {
1367         found = true;
1368     }
1369 
1370     if (found) {
1371         ITERATE(vector<string>, iter, data_out) {
1372             oids.push_back(atoi((*iter).c_str()));
1373         }
1374     }
1375 }
1376 
x_SparseStringToOids(const string &,vector<int> &,bool)1377 bool CSeqDBIsam::x_SparseStringToOids(const string   &,
1378                                       vector<int>    &,
1379                                       bool)
1380 
1381 {
1382     cerr << " this should be derived from readdb_acc2fastaEx().." << endl;
1383     _TROUBLE;
1384     return false;
1385 }
1386 
IdsToOids(int vol_start,int vol_end,CSeqDBGiList & ids)1387 void CSeqDBIsam::IdsToOids(int              vol_start,
1388                            int              vol_end,
1389                            CSeqDBGiList   & ids)
1390 
1391 
1392 {
1393     // The vol_start parameter is needed because translations in the
1394     // GI list should refer to global OIDs, not per-volume OIDs.
1395 
1396     switch (m_IdentType) {
1397     case eGiId:
1398         x_TranslateGiList<TGi>(vol_start, ids);
1399         break;
1400 
1401     case eTiId:
1402         x_TranslateGiList<TTi>(vol_start, ids);
1403         break;
1404 
1405     case eStringId:
1406         x_TranslateGiList<string>(vol_start, ids);
1407         break;
1408 
1409     case ePigId:
1410         x_TranslateGiList<TPig>(vol_start, ids);
1411         break;
1412 
1413     default:
1414         NCBI_THROW(CSeqDBException,
1415                        eArgErr,
1416                        "Error: Wrong type of idlist specified.");
1417     }
1418 }
1419 
IdsToOids(int vol_start,int vol_end,CSeqDBNegativeList & ids)1420 void CSeqDBIsam::IdsToOids(int                  vol_start,
1421                            int                  vol_end,
1422                            CSeqDBNegativeList & ids)
1423 
1424 
1425 {
1426     // The vol_start parameter is needed because translations in the
1427     // GI list should refer to global OIDs, not per-volume OIDs.
1428 
1429     _ASSERT(m_IdentType == eGiId || m_IdentType == eTiId || m_IdentType == eStringId);
1430 
1431     //m_Atlas.Lock(locked);
1432 
1433     ids.InsureOrder();
1434 
1435     if ((m_IdentType == eGiId) && ids.GetNumGis()) {
1436         x_SearchNegativeMulti(vol_start,
1437                               vol_end,
1438                               ids,
1439                               false);
1440 
1441     }
1442 
1443     if ((m_IdentType == eTiId) && ids.GetNumTis()) {
1444         x_SearchNegativeMulti(vol_start,
1445                               vol_end,
1446                               ids,
1447                               true);
1448 
1449     }
1450 
1451     if(m_IdentType == eStringId && ids.GetNumSis()) {
1452         x_SearchNegativeMultiSeq(vol_start,
1453                               vol_end,
1454                               ids);
1455                               //true,
1456 
1457     }
1458 }
1459 
x_FindIndexBounds()1460 void CSeqDBIsam::x_FindIndexBounds()
1461 {
1462     Int4 Start (0);
1463     Int4 Stop  (m_NumSamples - 1);
1464 
1465     //m_Atlas.Lock(locked);
1466 
1467 
1468     if (m_Type == eNumeric) {
1469         //
1470         // Get first key from data file
1471 
1472         int num_elements(0);
1473         int start(0);
1474         const void * data_page(0);
1475 
1476         x_MapDataPage(Start,
1477                       start,
1478                       num_elements,
1479                       & data_page);
1480 
1481 
1482         _ASSERT(num_elements);
1483 
1484         int elem_index = 0;
1485 
1486         Int8 data_gi(0);
1487         int data_oid(-1);
1488 
1489         x_GetDataElement(data_page,
1490                          elem_index,
1491                          data_gi,
1492                          data_oid);
1493 
1494         m_FirstKey.SetNumeric(data_gi);
1495 
1496 
1497         //
1498         // Get last key from data file
1499 
1500         x_MapDataPage(Stop,
1501                       start,
1502                       num_elements,
1503                       & data_page);
1504 
1505 
1506         _ASSERT(num_elements);
1507 
1508         elem_index = num_elements - 1;
1509 
1510         x_GetDataElement(data_page,
1511                          elem_index,
1512                          data_gi,
1513                          data_oid);
1514 
1515         m_LastKey.SetNumeric(data_gi);
1516     } else {
1517         //
1518         // Load the appropriate page of terms into memory.
1519 
1520         const char * beginp(0);
1521         const char * endp(0);
1522 
1523         //
1524         // Load the first page
1525 
1526         x_LoadPage(Start, Start + 1, & beginp, & endp);
1527 
1528         // Get first term
1529 
1530         vector<string> keys_out;
1531         vector<string> data_out; // not used
1532 
1533         x_ExtractData(beginp,
1534                       endp,
1535                       keys_out,
1536                       data_out);
1537 
1538         x_Lower(keys_out.front());
1539         m_FirstKey.SetString(keys_out.front());
1540 
1541 
1542         //
1543         // Load the last page
1544 
1545         x_LoadPage(Stop, Stop + 1, & beginp, & endp);
1546 
1547         // Advance to last item
1548 
1549         const char * lastp(0);
1550         const char * indexp(beginp);
1551 
1552         while (indexp < endp) {
1553             // Remember our new "last term" value.
1554 
1555             lastp = indexp;
1556 
1557             // Skip remainder of term, and any nulls after it.
1558 
1559             while((indexp < endp) && s_SeqDBIsam_NullifyEOLs(*indexp)) {
1560                 indexp++;
1561             }
1562             while((indexp < endp) && (! s_SeqDBIsam_NullifyEOLs(*indexp))) {
1563                 indexp++;
1564             }
1565         }
1566 
1567         // Get the last key
1568 
1569         _ASSERT(lastp);
1570 
1571         keys_out.clear();
1572         data_out.clear();
1573 
1574         x_ExtractData(lastp,
1575                       endp,
1576                       keys_out,
1577                       data_out);
1578 
1579         x_Lower(keys_out.front());
1580         m_LastKey.SetString(keys_out.front());
1581     }
1582 }
1583 
x_OutOfBounds(Int8 key)1584 bool CSeqDBIsam::x_OutOfBounds(Int8 key)
1585 {
1586     if (! (m_FirstKey.IsSet() && m_LastKey.IsSet())) {
1587         return false;
1588     }
1589 
1590     _ASSERT(m_Type == eNumeric);
1591 
1592     if (m_FirstKey.OutsideFirstBound(key)) {
1593         return true;
1594     }
1595 
1596     if (m_LastKey.OutsideLastBound(key)) {
1597         return true;
1598     }
1599 
1600     return false;
1601 }
1602 
x_OutOfBounds(string key)1603 bool CSeqDBIsam::x_OutOfBounds(string key)
1604 {
1605     if (! (m_FirstKey.IsSet() && m_LastKey.IsSet())) {
1606         return false;
1607     }
1608 
1609     _ASSERT(m_Type == eString);
1610 
1611     x_Lower(key);
1612 
1613     if (m_FirstKey.OutsideFirstBound(key)) {
1614         return true;
1615     }
1616 
1617     if (m_LastKey.OutsideLastBound(key)) {
1618         return true;
1619     }
1620 
1621     return false;
1622 }
1623 
GetIdBounds(Int8 & low_id,Int8 & high_id,int & count)1624 void CSeqDBIsam::GetIdBounds(Int8           & low_id,
1625                              Int8           & high_id,
1626                              int            & count)
1627 
1628 
1629 {
1630     if(m_Initialized == false) {
1631         count = 0;
1632         return;
1633     }
1634 
1635     if (! (m_FirstKey.IsSet() && m_LastKey.IsSet())) {
1636         count = 0;
1637         return;
1638     }
1639 
1640     low_id = m_FirstKey.GetNumeric();
1641     high_id = m_LastKey.GetNumeric();
1642     count = m_NumTerms;
1643 }
1644 
GetIdBounds(string & low_id,string & high_id,int & count)1645 void CSeqDBIsam::GetIdBounds(string         & low_id,
1646                              string         & high_id,
1647                              int            & count)
1648 
1649 
1650 {
1651     if(m_Initialized == false) {
1652         count = 0;
1653         return;
1654     }
1655 
1656     if (! (m_FirstKey.IsSet() && m_LastKey.IsSet())) {
1657         count = 0;
1658         return;
1659     }
1660 
1661     low_id = m_FirstKey.GetString();
1662     high_id = m_LastKey.GetString();
1663     count = m_NumTerms;
1664 }
1665 
HashToOids(unsigned hash,vector<TOid> & oids)1666 void CSeqDBIsam::HashToOids(unsigned         hash,
1667                             vector<TOid>   & oids)
1668 
1669 
1670 {
1671     _ASSERT(m_IdentType == eHashId);
1672     if(m_Initialized == false) {
1673         return;
1674     }
1675 
1676     bool found = false;
1677 
1678     string key(NStr::UIntToString(hash));
1679 
1680     EErrorCode err = eNoError;
1681 
1682     vector<string> keys_out;
1683     vector<string> data_out;
1684     vector<TIndx>  indices_out;
1685 
1686     if ((err = x_StringSearch(key,
1687                               keys_out,
1688                               data_out,
1689                               indices_out)) < 0) {
1690 
1691         return;
1692     }
1693 
1694     if (err != eNotFound) {
1695         found = true;
1696     }
1697 
1698     if (found) {
1699         ITERATE(vector<string>, iter, data_out) {
1700             oids.push_back(atoi(iter->c_str()));
1701         }
1702     }
1703 }
1704 
1705 END_NCBI_SCOPE
1706 
1707