1 /* $Id: seqdbgilistset.cpp 618513 2020-10-21 16:07:12Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kevin Bealer
27 *
28 */
29
30 /// @file seqdbgilistset.cpp
31 /// Implementation for the CSeqDBVol class, which provides an
32 /// interface for all functionality of one database volume.
33 #include <ncbi_pch.hpp>
34 #include "seqdbgilistset.hpp"
35 #include <algorithm>
36
37 BEGIN_NCBI_SCOPE
38
39
40 /// Defines a pair of integers and a sort order.
41 ///
42 /// This struct stores a pair of integers, the volume index and the
43 /// oid count. The ordering is by the oid count, descending.
44
45 struct SSeqDB_IndexCountPair {
46 public:
47 /// Index of the volume in the volume set.
48 int m_Index;
49
50 /// Number of OIDs associated with this volume.
51 int m_Count;
52
53 /// Less than operator, where elements with larger allows sorting.
54 /// Elements are sorted by number of OIDs in descending order.
55 /// @param rhs
56 /// The right hand side of the less than. [in]
operator <SSeqDB_IndexCountPair57 bool operator < (const SSeqDB_IndexCountPair & rhs) const
58 {
59 return m_Count > rhs.m_Count;
60 }
61 };
62
63
64 /// CSeqDBNodeIdList
65 ///
66 /// This class defines a simple CSeqDBGiList subclass which is read
67 /// from a gi list file using the CSeqDBAtlas. It uses the atlas for
68 /// file access and registers the memory used by the vector with the
69 /// atlas layer.
70
71 class CSeqDBNodeFileIdList : public CSeqDBGiList {
72 public:
73 /// Build a GI,TI, or SI list from a memory mapped file.
74 ///
75 /// Given an ID list file mapped into a region of memory, this
76 /// class reads the GIs or TIs from the file.
77 ///
78 /// @param atlas
79 /// The memory management layer object. [in]
80 /// @param fname
81 /// The filename of this ID list. [in]
82 /// @param list_type
83 /// The type of ID [in]
84 /// @param locked
85 /// Lock holder object for this thread. [in]
CSeqDBNodeFileIdList(CSeqDBAtlas & atlas,const CSeqDB_Path & fname,CSeqDBGiListSet::EGiListType list_type,CSeqDBLockHold & locked)86 CSeqDBNodeFileIdList(CSeqDBAtlas & atlas,
87 const CSeqDB_Path & fname,
88 CSeqDBGiListSet::EGiListType list_type,
89 CSeqDBLockHold & locked)
90 : m_VectorMemory(atlas)
91 {
92 CSeqDBAtlas::TIndx file_size(0);
93
94 CSeqDBFileMemMap memlease(atlas,fname.GetPathS());
95 atlas.GetFileSizeL(fname.GetPathS(), file_size);
96
97 const char * fbeginp = memlease.GetFileDataPtr(0);
98
99 const char * fendp = fbeginp + file_size;
100
101 try {
102 bool in_order = false;
103
104 switch(list_type) {
105 case CSeqDBGiListSet::eGiList:
106 SeqDB_ReadMemoryGiList(fbeginp, fendp, m_GisOids, & in_order);
107 break;
108 case CSeqDBGiListSet::eTiList:
109 SeqDB_ReadMemoryTiList(fbeginp, fendp, m_TisOids, & in_order);
110 break;
111 case CSeqDBGiListSet::eSiList:
112 SeqDB_ReadMemorySiList(fbeginp, fendp, m_SisOids, & in_order);
113 break;
114 case CSeqDBGiListSet::ePigList:
115 SeqDB_ReadMemoryPigList(fbeginp, fendp, m_PigsOids, & in_order);
116 break;
117 }
118
119 if (in_order) {
120 m_CurrentOrder = eGi;
121 }
122 }
123 catch(...) {
124 memlease.Clear();
125 throw;
126 }
127
128 //memlease.Clear();
129
130 int vector_size =
131 (int(m_GisOids.size() * sizeof(m_GisOids[0])) +
132 int(m_TisOids.size() * sizeof(m_TisOids[0])));
133
134 atlas.RegisterExternal(m_VectorMemory, vector_size, locked);
135 // TODO m_VectorMemory for seqid_list
136 }
137
138 /// Destructor
~CSeqDBNodeFileIdList()139 virtual ~CSeqDBNodeFileIdList()
140 {
141 }
142
143 private:
144 /// Memory associated with the m_GisOids vector.
145 CSeqDBMemReg m_VectorMemory;
146 };
147
s_ProcessSeqIDsForV5(vector<string> & idlist)148 void s_ProcessSeqIDsForV5 (vector<string> & idlist)
149 {
150 vector<string> tmplist;
151 tmplist.reserve(idlist.size());
152 for(unsigned int i=0; i < idlist.size(); i++) {
153 try {
154 CSeq_id seqid(idlist[i], CSeq_id::fParse_RawText | CSeq_id::fParse_AnyLocal | CSeq_id::fParse_PartialOK);
155 if(seqid.IsGi()) {
156 continue;
157 }
158
159 if(seqid.IsPir() || seqid.IsPrf()) {
160 string id = seqid.AsFastaString();
161 tmplist.push_back(id);
162 continue;
163 }
164 tmplist.push_back(seqid.GetSeqIdString(true));
165 } catch (CException & e) {
166 ERR_POST(e.GetMsg());
167 NCBI_THROW(CSeqDBException, eArgErr, "Invalid seq id: " + idlist[i]);
168
169 }
170 }
171
172 if (tmplist.size() == 0) {
173 ERR_POST(Warning << "Empty seqid list");
174 }
175 else {
176 sort(tmplist.begin(), tmplist.end());
177 vector<string>::iterator it = unique (tmplist.begin(), tmplist.end());
178 tmplist.resize(distance(tmplist.begin(),it));
179 }
180 idlist.swap(tmplist);
181 }
182
s_ProcessPositiveSeqIDsForV5(CRef<CSeqDBGiList> & user_list)183 void s_ProcessPositiveSeqIDsForV5(CRef<CSeqDBGiList> & user_list)
184 {
185 SBlastSeqIdListInfo newInfo = user_list->GetListInfo();
186 newInfo.is_v4 = false;
187 user_list->SetListInfo (newInfo);
188
189 vector<string> idlist;
190 user_list->GetSiList(idlist);
191 s_ProcessSeqIDsForV5(idlist);
192 user_list->SetSiList(idlist);
193 }
194
s_ProcessNegativeSeqIDsForV5(CRef<CSeqDBNegativeList> & user_list)195 void s_ProcessNegativeSeqIDsForV5(CRef<CSeqDBNegativeList> & user_list)
196 {
197 SBlastSeqIdListInfo newInfo = user_list->GetListInfo();
198 newInfo.is_v4 = false;
199 user_list->SetListInfo (newInfo);
200
201 vector<string> idlist = user_list->GetSiList();
202 s_ProcessSeqIDsForV5 (idlist);
203 user_list->SetSiList(idlist);
204 }
205
s_VerifySeqidlist(const SBlastSeqIdListInfo & list_info,const CSeqDBVolSet & volset,const CSeqDBLMDBSet & lmdb_set)206 bool s_VerifySeqidlist(const SBlastSeqIdListInfo & list_info, const CSeqDBVolSet & volset, const CSeqDBLMDBSet & lmdb_set)
207 {
208 if(list_info.is_v4 && lmdb_set.IsBlastDBVersion5()) {
209 ERR_POST(Warning << "To obtain better run time performance, " \
210 "please run blastdb_aliastool -seqid_file_in <INPUT_FILE_NAME> " \
211 "-seqid_file_out <OUT_FILE_NAME> and use <OUT_FILE_NAME> as the argument to -seqidlist");
212 return true;
213 }
214
215 if((!list_info.is_v4) && (!lmdb_set.IsBlastDBVersion5())) {
216 NCBI_THROW(CSeqDBException, eArgErr,
217 "Seqidlist is not in BLAST db v4 format");
218 }
219
220 if((list_info.db_vol_length > 0) &&
221 (list_info.db_vol_length != volset.GetVolumeSetLength())) {
222 ERR_POST(Warning << "Seqidlist file db info does not match input db");
223 }
224
225 return false;
226 }
227
228 void
x_ResolvePositiveList(CSeqDBAtlas & atlas,const CSeqDBVolSet & volset,CRef<CSeqDBGiList> user_list,CSeqDBLockHold & locked,const CSeqDBLMDBSet & lmdb_set)229 CSeqDBGiListSet::x_ResolvePositiveList(CSeqDBAtlas & atlas,
230 const CSeqDBVolSet & volset,
231 CRef<CSeqDBGiList> user_list,
232 CSeqDBLockHold & locked,
233 const CSeqDBLMDBSet & lmdb_set)
234 {
235 if (m_UserList.NotEmpty() && m_UserList->NotEmpty()) {
236 if(user_list->GetNumSis() > 0) {
237 if(s_VerifySeqidlist(user_list->GetListInfo(), volset, lmdb_set)) {
238 s_ProcessPositiveSeqIDsForV5(user_list);
239 }
240 }
241
242 if((user_list->GetNumTaxIds() > 0) && !(lmdb_set.IsBlastDBVersion5())) {
243 NCBI_THROW(CSeqDBException, eArgErr, "Taxonomy filtering is not supported in v4 BLAST dbs");
244 }
245 if (lmdb_set.IsBlastDBVersion5()) {
246 if(user_list->GetNumSis() > 0) {
247 vector<string> accs;
248 vector<blastdb::TOid> oids;
249 user_list->GetSiList(accs);
250 lmdb_set.AccessionsToOids(accs, oids);
251 for(unsigned int i=0; i < accs.size(); i++) {
252 user_list->SetSiTranslation(i, oids[i]);
253 }
254 }
255 if(user_list->GetNumTaxIds() > 0) {
256 vector<blastdb::TOid> & oids = user_list->SetOidsForTaxIdsList();
257 set<TTaxId> & tax_ids = user_list->GetTaxIdsList();
258 lmdb_set.TaxIdsToOids(tax_ids, oids);
259 }
260 if((user_list->GetNumGis() == 0) && (user_list->GetNumTis() == 0) &&
261 (user_list->GetNumPigs() == 0)) {
262 return;
263 }
264 }
265 if((user_list->GetNumSis() > 0) && !(lmdb_set.IsBlastDBVersion5())) {
266
267 user_list->PreprocessIdsForISAMSiLookup();
268 }
269
270 typedef SSeqDB_IndexCountPair TIndexCount;
271 vector<TIndexCount> OidsPerVolume;
272
273 // Build a list of volumes sorted by OID count.
274
275 for(int i = 0; i < volset.GetNumVols(); i++) {
276 const CSeqDBVolEntry * vol = volset.GetVolEntry(i);
277
278 TIndexCount vol_oids;
279 vol_oids.m_Index = i;
280 vol_oids.m_Count = vol->OIDEnd() - vol->OIDStart();
281
282 OidsPerVolume.push_back(vol_oids);
283 }
284
285 // The largest volumes should be used first, to minimize the
286 // number of failed GI->OID conversion attempts. Searching input
287 // GIs against larger volumes first should eliminate most of the
288 // GIs by the time smaller volumes are searched, thus reducing the
289 // total number of lookups.
290
291 std::sort(OidsPerVolume.begin(), OidsPerVolume.end());
292
293 for(int i = 0; i < (int)OidsPerVolume.size(); i++) {
294 int vol_idx = OidsPerVolume[i].m_Index;
295
296 const CSeqDBVolEntry * vol = volset.GetVolEntry(vol_idx);
297
298 // Note: The implied ISAM lookups will sort by GI/TI.
299
300 vol->Vol()->IdsToOids(*m_UserList, locked);
301 }
302 }
303 }
304
305
306 void
x_ResolveNegativeList(CSeqDBAtlas & atlas,const CSeqDBVolSet & volset,CRef<CSeqDBNegativeList> neg_list,CSeqDBLockHold & locked,const CSeqDBLMDBSet & lmdb_set)307 CSeqDBGiListSet::x_ResolveNegativeList(CSeqDBAtlas & atlas,
308 const CSeqDBVolSet & volset,
309 CRef<CSeqDBNegativeList> neg_list,
310 CSeqDBLockHold & locked,
311 const CSeqDBLMDBSet & lmdb_set)
312 {
313 if (m_NegativeList.NotEmpty() && m_NegativeList->NotEmpty()) {
314 // We don't bother to sort these since every ISAM mapping must
315 // be examined for the negative ID list case.
316 if(m_NegativeList->GetNumSis() > 0) {
317 if (s_VerifySeqidlist(m_NegativeList->GetListInfo(), volset, lmdb_set)) {
318 s_ProcessNegativeSeqIDsForV5(neg_list);
319 }
320 }
321
322 if((m_NegativeList->GetNumTaxIds() > 0) && !(lmdb_set.IsBlastDBVersion5())) {
323 NCBI_THROW(CSeqDBException, eArgErr, "Taxonomy filtering is not supported in v4 BLAST dbs");
324 }
325
326 if(lmdb_set.IsBlastDBVersion5()) {
327
328 if(m_NegativeList->GetNumSis() > 0) {
329 vector<blastdb::TOid> & oids = m_NegativeList->SetExcludedOids();
330 lmdb_set.NegativeSeqIdsToOids(m_NegativeList->GetSiList(), oids);
331
332 }
333 if(m_NegativeList->GetNumTaxIds() > 0) {
334 vector<blastdb::TOid> & oids = m_NegativeList->SetExcludedOids();
335 set<TTaxId> & tax_ids = m_NegativeList->GetTaxIdsList();
336 lmdb_set.NegativeTaxIdsToOids(tax_ids, oids);
337 }
338
339 if((m_NegativeList->GetNumGis() == 0) && (m_NegativeList->GetNumTis() == 0) &&
340 (m_NegativeList->GetNumPigs() == 0)) {
341 return;
342 }
343 }
344
345 if((m_NegativeList->GetNumSis() > 0) && !(lmdb_set.IsBlastDBVersion5())) {
346 m_NegativeList->PreprocessIdsForISAMSiLookup();
347 }
348
349 if (m_NegativeList->GetNumPigs() > 0) {
350 CSeqDBGiList pigs;
351 pigs.ReservePigs(m_NegativeList->GetNumPigs());
352 ITERATE(vector<TPig>, p, m_NegativeList->GetPigList()){
353 pigs.AddPig(*p);
354 }
355 for(int v = 0; v < volset.GetNumVols(); v++) {
356 const CSeqDBVolEntry * vol = volset.GetVolEntry(v);
357 vol->Vol()->IdsToOids(pigs, locked);
358 }
359
360 vector<blastdb::TOid> & exclude_oid_list = m_NegativeList->SetExcludedOids();
361 for(int o=0; o < pigs.GetNumPigs(); o++) {
362 const CSeqDBGiList::SPigOid & pig = pigs.GetPigOid(o);
363 if(pig.oid != -1) {
364 exclude_oid_list.push_back(pig.oid);
365 }
366 }
367
368 }
369 for(int i = 0; i < volset.GetNumVols(); i++) {
370 const CSeqDBVolEntry * vol = volset.GetVolEntry(i);
371
372 // Note: The implied ISAM lookups will sort by GI/TI.
373
374 vol->Vol()->IdsToOids(*m_NegativeList, locked);
375 }
376 }
377 }
378
CSeqDBGiListSet(CSeqDBAtlas & atlas,const CSeqDBVolSet & volset,CRef<CSeqDBGiList> user_list,CRef<CSeqDBNegativeList> neg_list,CSeqDBLockHold & locked,const CSeqDBLMDBSet & lmdb_set)379 CSeqDBGiListSet::CSeqDBGiListSet(CSeqDBAtlas & atlas,
380 const CSeqDBVolSet & volset,
381 CRef<CSeqDBGiList> user_list,
382 CRef<CSeqDBNegativeList> neg_list,
383 CSeqDBLockHold & locked,
384 const CSeqDBLMDBSet & lmdb_set)
385 : m_Atlas (atlas),
386 m_UserList (user_list),
387 m_NegativeList (neg_list)
388 {
389 x_ResolvePositiveList(atlas, volset, user_list, locked, lmdb_set);
390 x_ResolveNegativeList(atlas, volset, neg_list, locked, lmdb_set);
391 }
392
393 CRef<CSeqDBGiList>
GetNodeIdList(const CSeqDB_Path & filename,const CSeqDBVol * volp,EGiListType list_type,CSeqDBLockHold & locked)394 CSeqDBGiListSet::GetNodeIdList(const CSeqDB_Path & filename,
395 const CSeqDBVol * volp,
396 EGiListType list_type,
397 CSeqDBLockHold & locked)
398 {
399 // Note: possibly the atlas should have a method to add and
400 // subtract pseudo allocations from the memory bound; this would
401 // allow GI list vectors to share the memory bound with memory
402 // mapped file ranges.
403
404 //m_Atlas.Lock(locked);
405
406 // Seperate indices are used for TIs and GIs. (Attempting to use
407 // the same file for both should also produce an error when the
408 // binary file is read, as the magic number is different.)
409
410 TNodeListMap& map_ref = (list_type == eGiList) ? m_GINodeListMap :
411 ((list_type == eTiList) ? m_TINodeListMap : m_SINodeListMap);
412 CRef<CSeqDBGiList> gilist = map_ref[filename.GetPathS()];
413
414 if (gilist.Empty()) {
415 gilist.Reset(new CSeqDBNodeFileIdList(m_Atlas,
416 filename,
417 list_type,
418 locked));
419
420 if (m_UserList.NotEmpty()) {
421 // Note: translates the GIs and TIs, but ignores Seq-ids.
422 x_TranslateFromUserList(*gilist);
423 }
424
425 map_ref[filename.GetPathS()] = gilist;
426 }
427
428 // Note: in pure-GI mode, it might be more efficient (in some
429 // cases) to translate all sub-lists, then translate the main list
430 // in terms of those, rather than the other way round. It might
431 // be a good idea to investigate this -- it should only be done if
432 // the sub-lists are (collectively) smaller than the user list.
433
434 // If there are Seq-ids, we need to translate all GI lists from
435 // the volume data, because some of the Seq-ids may refer to the
436 // same sequences as GIs in those lists. More sophisticated
437 // methods are possible; for example, we could try to convert all
438 // Seq-ids to GIs. If this worked (it would for those databases
439 // where GIs are available for all sequences), then we would be
440 // able to continue processing as if the User GI list were
441 // strictly GI based.
442
443 // The ideal solution might be to build a conceptual map of all
444 // data sources and estimate the time needed for different
445 // techniques. This has not been done.
446
447 bool mixed_ids = m_UserList.Empty() || (!! m_UserList->GetNumSis())|| (m_UserList->GetNumTaxIds());
448
449 if (! mixed_ids) {
450 if ((m_UserList->GetNumTis() && gilist->GetNumGis()) ||
451 (m_UserList->GetNumGis() && gilist->GetNumTis())) {
452
453 mixed_ids = true;
454 }
455 }
456
457 if (m_UserList.Empty() || mixed_ids) {
458 if(gilist->GetNumSis() > 0 ) {
459 gilist->PreprocessIdsForISAMSiLookup();
460 }
461 volp->IdsToOids(*gilist, locked);
462 }
463
464 // If there is a volume GI list, it will also be attached to the
465 // volume, and replaces the user GI list attachment (if there was
466 // one). There can be one user GI list or multiple volume GI
467 // lists for each volume.
468
469 volp->AttachVolumeGiList(gilist);
470
471 return gilist;
472 }
473
x_TranslateGisFromUserList(CSeqDBGiList & gilist)474 void CSeqDBGiListSet::x_TranslateGisFromUserList(CSeqDBGiList & gilist)
475 {
476 CSeqDBGiList & source = *m_UserList;
477 CSeqDBGiList & target = gilist;
478
479 source.InsureOrder(CSeqDBGiList::eGi);
480 target.InsureOrder(CSeqDBGiList::eGi);
481
482 int source_num = source.GetNumGis();
483 int target_num = target.GetNumGis();
484
485 int source_index = 0;
486 int target_index = 0;
487
488 while(source_index < source_num && target_index < target_num) {
489 TGi source_gi = source.GetGiOid(source_index).gi;
490 TGi target_gi = target.GetGiOid(target_index).gi;
491
492 // Match; translate if needed
493
494 if (source_gi == target_gi) {
495 if (target.GetGiOid(target_index).oid == -1) {
496 target.SetGiTranslation(target_index, source.GetGiOid(source_index).oid);
497 }
498 target_index++;
499 source_index++;
500 } else if (source_gi > target_gi) {
501 target_index ++;
502
503 // Search target using expanding jumps
504 int jump = 2;
505 int test = target_index + jump;
506
507 while(test < target_num && target.GetGiOid(test).gi < source_gi) {
508 target_index = test;
509 jump *= 2;
510 test = target_index + jump;
511 }
512 } else /* source_gi < target_gi */ {
513 source_index ++;
514
515 // Search source using expanding jumps
516 int jump = 2;
517 int test = source_index + jump;
518
519 while(test < source_num && source.GetGiOid(test).gi < target_gi) {
520 source_index = test;
521 jump *= 2;
522 test = source_index + jump;
523 }
524 }
525 }
526 }
527
x_TranslateTisFromUserList(CSeqDBGiList & gilist)528 void CSeqDBGiListSet::x_TranslateTisFromUserList(CSeqDBGiList & gilist)
529 {
530 CSeqDBGiList & source = *m_UserList;
531 CSeqDBGiList & target = gilist;
532
533 source.InsureOrder(CSeqDBGiList::eGi);
534 target.InsureOrder(CSeqDBGiList::eGi);
535
536 int source_num = source.GetNumTis();
537 int target_num = target.GetNumTis();
538
539 int source_index = 0;
540 int target_index = 0;
541
542 while(source_index < source_num && target_index < target_num) {
543 TTi source_ti = source.GetTiOid(source_index).ti;
544 TTi target_ti = target.GetTiOid(target_index).ti;
545
546 // Match; translate if needed
547
548 if (source_ti == target_ti) {
549 if (target.GetTiOid(target_index).oid == -1) {
550 target.SetTiTranslation(target_index,
551 source.GetTiOid(source_index).oid);
552 }
553
554 target_index++;
555 source_index++;
556 } else if (source_ti > target_ti) {
557 target_index ++;
558
559 // Search target using expanding jumps
560 int jump = 2;
561 int test = target_index + jump;
562
563 while(test < target_num &&
564 target.GetTiOid(test).ti < source_ti) {
565
566 target_index = test;
567 jump *= 2;
568 test = target_index + jump;
569 }
570 } else /* source_ti < target_ti */ {
571 source_index ++;
572
573 // Search source using expanding jumps
574 int jump = 2;
575 int test = source_index + jump;
576
577 while(test < source_num &&
578 source.GetTiOid(test).ti < target_ti) {
579
580 source_index = test;
581 jump *= 2;
582 test = source_index + jump;
583 }
584 }
585 }
586 }
587
x_TranslateFromUserList(CSeqDBGiList & gilist)588 void CSeqDBGiListSet::x_TranslateFromUserList(CSeqDBGiList & gilist)
589 {
590 x_TranslateGisFromUserList(gilist);
591 x_TranslateTisFromUserList(gilist);
592 }
593
594 END_NCBI_SCOPE
595
596