1 /* $Id: seq_id_tree.cpp 624726 2021-02-03 18:51:54Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aleksey Grichenko, Eugene Vasilchenko
27 *
28 * File Description:
29 * Seq-id mapper for Object Manager
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <objects/misc/error_codes.hpp>
35 #include <corelib/ncbi_param.hpp>
36 #include "seq_id_tree.hpp"
37 #include <objects/seq/seq_id_mapper.hpp>
38 #include <common/ncbi_sanitizers.h>
39
40
41 #define NCBI_USE_ERRCODE_X Objects_SeqIdMap
42
43
44 BEGIN_NCBI_SCOPE
45 BEGIN_SCOPE(objects)
46
// Uncomment the define below to make GetPackedSeqId() swap the cached
// Seq-id under a static fast mutex instead of using the atomic CRef calls.
//#define NCBI_SLOW_ATOMIC_SWAP
#if defined(NCBI_SLOW_ATOMIC_SWAP)
DEFINE_STATIC_FAST_MUTEX(sx_GetSeqIdMutex);
#endif
51
52 ////////////////////////////////////////////////////////////////////
53 //
54 // CSeq_id_***_Tree::
55 //
56 // Seq-id sub-type specific trees
57 //
58
CSeq_id_Which_Tree(CSeq_id_Mapper * mapper)59 CSeq_id_Which_Tree::CSeq_id_Which_Tree(CSeq_id_Mapper* mapper)
60 : m_Mapper(mapper)
61 {
62 _ASSERT(mapper);
63 }
64
65
~CSeq_id_Which_Tree(void)66 CSeq_id_Which_Tree::~CSeq_id_Which_Tree(void)
67 {
68 }
69
70
HaveMatch(const CSeq_id_Handle &) const71 bool CSeq_id_Which_Tree::HaveMatch(const CSeq_id_Handle& ) const
72 {
73 return false; // Assume no matches by default
74 }
75
76
FindMatch(const CSeq_id_Handle & id,TSeq_id_MatchList & id_list) const77 void CSeq_id_Which_Tree::FindMatch(const CSeq_id_Handle& id,
78 TSeq_id_MatchList& id_list) const
79 {
80 id_list.insert(id); // only exact match by default
81 }
82
83
Match(const CSeq_id_Handle & h1,const CSeq_id_Handle & h2) const84 bool CSeq_id_Which_Tree::Match(const CSeq_id_Handle& h1,
85 const CSeq_id_Handle& h2) const
86 {
87 if ( h1 == h2 ) {
88 return true;
89 }
90 if ( HaveMatch(h1) ) {
91 TSeq_id_MatchList id_list;
92 FindMatch(h1, id_list);
93 return id_list.find(h2) != id_list.end();
94 }
95 return false;
96 }
97
98
IsBetterVersion(const CSeq_id_Handle &,const CSeq_id_Handle &) const99 bool CSeq_id_Which_Tree::IsBetterVersion(const CSeq_id_Handle& /*h1*/,
100 const CSeq_id_Handle& /*h2*/) const
101 {
102 return false; // No id version by default
103 }
104
105
106 inline
CreateInfo(CSeq_id::E_Choice type)107 CSeq_id_Info* CSeq_id_Which_Tree::CreateInfo(CSeq_id::E_Choice type)
108 {
109 return new CSeq_id_Info(type, m_Mapper);
110 }
111
112
HaveReverseMatch(const CSeq_id_Handle &) const113 bool CSeq_id_Which_Tree::HaveReverseMatch(const CSeq_id_Handle& ) const
114 {
115 return false; // Assume no reverse matches by default
116 }
117
118
FindReverseMatch(const CSeq_id_Handle & id,TSeq_id_MatchList & id_list)119 void CSeq_id_Which_Tree::FindReverseMatch(const CSeq_id_Handle& id,
120 TSeq_id_MatchList& id_list)
121 {
122 id_list.insert(id);
123 return;
124 }
125
126
s_AssignObject_id(CObject_id & new_id,const CObject_id & old_id)127 static inline void s_AssignObject_id(CObject_id& new_id,
128 const CObject_id& old_id)
129 {
130 if ( old_id.IsStr() ) {
131 new_id.SetStr(old_id.GetStr());
132 }
133 else {
134 new_id.SetId(old_id.GetId());
135 }
136 }
137
138
s_AssignDbtag(CDbtag & new_id,const CDbtag & old_id)139 static inline void s_AssignDbtag(CDbtag& new_id,
140 const CDbtag& old_id)
141 {
142 new_id.SetDb(old_id.GetDb());
143 s_AssignObject_id(new_id.SetTag(), old_id.GetTag());
144 }
145
146
s_AssignTextseq_id(CTextseq_id & new_tid,const CTextseq_id & old_tid)147 static inline void s_AssignTextseq_id(CTextseq_id& new_tid,
148 const CTextseq_id& old_tid)
149 {
150 if (old_tid.IsSetAccession()) {
151 new_tid.SetAccession(old_tid.GetAccession());
152 }
153 if (old_tid.IsSetVersion()) {
154 new_tid.SetVersion(old_tid.GetVersion());
155 }
156 if (old_tid.IsSetName()) {
157 new_tid.SetName(old_tid.GetName());
158 }
159 if (old_tid.IsSetRelease()) {
160 new_tid.SetRelease(old_tid.GetRelease());
161 }
162 }
163
164
s_AssignSeq_id(CSeq_id & new_id,const CSeq_id & old_id)165 static inline void s_AssignSeq_id(CSeq_id& new_id,
166 const CSeq_id& old_id)
167 {
168 switch (old_id.Which()) {
169 case CSeq_id::e_Gi:
170 new_id.SetGi(old_id.GetGi());
171 break;
172
173 case CSeq_id::e_Local:
174 s_AssignObject_id(new_id.SetLocal(), old_id.GetLocal());
175 break;
176
177 case CSeq_id::e_General:
178 s_AssignDbtag(new_id.SetGeneral(), old_id.GetGeneral());
179 break;
180
181 case CSeq_id::e_Other:
182 s_AssignTextseq_id(new_id.SetOther(), old_id.GetOther());
183 break;
184
185 case CSeq_id::e_Genbank:
186 s_AssignTextseq_id(new_id.SetGenbank(), old_id.GetGenbank());
187 break;
188
189 case CSeq_id::e_Embl:
190 s_AssignTextseq_id(new_id.SetEmbl(), old_id.GetEmbl());
191 break;
192
193 case CSeq_id::e_Ddbj:
194 s_AssignTextseq_id(new_id.SetDdbj(), old_id.GetDdbj());
195 break;
196
197 case CSeq_id::e_Gpipe:
198 s_AssignTextseq_id(new_id.SetGpipe(), old_id.GetGpipe());
199 break;
200
201 case CSeq_id::e_Named_annot_track:
202 s_AssignTextseq_id(new_id.SetNamed_annot_track(), old_id.GetNamed_annot_track());
203 break;
204
205 default:
206 new_id.Assign(old_id);
207 break;
208 }
209 }
210
211
CreateInfo(const CSeq_id & id)212 CSeq_id_Info* CSeq_id_Which_Tree::CreateInfo(const CSeq_id& id)
213 {
214 CRef<CSeq_id> id_ref(new CSeq_id);
215 s_AssignSeq_id(*id_ref, id);
216 return new CSeq_id_Info(id_ref, m_Mapper);
217 }
218
219
DropInfo(const CSeq_id_Info * info)220 void CSeq_id_Which_Tree::DropInfo(const CSeq_id_Info* info)
221 {
222 TWriteLockGuard guard(m_TreeLock);
223 if ( info->IsLocked() ) {
224 _ASSERT(info->m_Seq_id_Type != CSeq_id::e_not_set);
225 return;
226 }
227 if ( info->m_Seq_id_Type == CSeq_id::e_not_set ) {
228 _ASSERT(!info->IsLocked());
229 return;
230 }
231 x_Unindex(info);
232 _ASSERT(!info->IsLocked());
233 _ASSERT(info->m_Seq_id_Type != CSeq_id::e_not_set);
234 // ThreadSanitizer may report this as a race since m_Seq_id_Type
235 // may be accessed by other threads without locking the mutex.
236 // This race is safe to suppress since the object is never actually
237 // used after entering DropInfo().
238 const_cast<CSeq_id_Info*>(info)->m_Seq_id_Type = CSeq_id::e_not_set;
239 }
240
241
GetGiHandle(TGi)242 CSeq_id_Handle CSeq_id_Which_Tree::GetGiHandle(TGi /*gi*/)
243 {
244 NCBI_THROW(CSeq_id_MapperException, eTypeError, "Invalid seq-id type");
245 }
246
247
Initialize(CSeq_id_Mapper * mapper,vector<CRef<CSeq_id_Which_Tree>> & v)248 void CSeq_id_Which_Tree::Initialize(CSeq_id_Mapper* mapper,
249 vector<CRef<CSeq_id_Which_Tree> >& v)
250 {
251 NCBI_LSAN_DISABLE_GUARD;
252
253 v.resize(CSeq_id::e_MaxChoice);
254 v[CSeq_id::e_not_set].Reset(new CSeq_id_not_set_Tree(mapper));
255 v[CSeq_id::e_Local].Reset(new CSeq_id_Local_Tree(mapper));
256 v[CSeq_id::e_Gibbsq].Reset(new CSeq_id_Gibbsq_Tree(mapper));
257 v[CSeq_id::e_Gibbmt].Reset(new CSeq_id_Gibbmt_Tree(mapper));
258 v[CSeq_id::e_Giim].Reset(new CSeq_id_Giim_Tree(mapper));
259 // These three types share the same accessions space
260 CRef<CSeq_id_Which_Tree> gb(new CSeq_id_GB_Tree(mapper));
261 v[CSeq_id::e_Genbank] = gb;
262 v[CSeq_id::e_Embl] = gb;
263 v[CSeq_id::e_Ddbj] = gb;
264 v[CSeq_id::e_Pir].Reset(new CSeq_id_Pir_Tree(mapper));
265 v[CSeq_id::e_Swissprot].Reset(new CSeq_id_Swissprot_Tree(mapper));
266 v[CSeq_id::e_Patent].Reset(new CSeq_id_Patent_Tree(mapper));
267 v[CSeq_id::e_Other].Reset(new CSeq_id_Other_Tree(mapper));
268 v[CSeq_id::e_General].Reset(new CSeq_id_General_Tree(mapper));
269 v[CSeq_id::e_Gi].Reset(new CSeq_id_Gi_Tree(mapper));
270 // see above v[CSeq_id::e_Ddbj] = gb;
271 v[CSeq_id::e_Prf].Reset(new CSeq_id_Prf_Tree(mapper));
272 v[CSeq_id::e_Pdb].Reset(new CSeq_id_PDB_Tree(mapper));
273 v[CSeq_id::e_Tpg].Reset(new CSeq_id_Tpg_Tree(mapper));
274 v[CSeq_id::e_Tpe].Reset(new CSeq_id_Tpe_Tree(mapper));
275 v[CSeq_id::e_Tpd].Reset(new CSeq_id_Tpd_Tree(mapper));
276 v[CSeq_id::e_Gpipe].Reset(new CSeq_id_Gpipe_Tree(mapper));
277 v[CSeq_id::e_Named_annot_track].Reset(new CSeq_id_Named_annot_track_Tree(mapper));
278 }
279
280
// Per-allocation bookkeeping estimate used by the Dump() statistics.
static const size_t kMallocOverhead = 2*sizeof(void*);

// Estimate the memory attributed to a string's buffer for statistics.
// Returns 0 for zero capacity. Otherwise returns the capacity, plus one
// pointer and kMallocOverhead when capacity + sizeof(void*) exceeds
// sizeof(string) (the buffer is then counted as a separate allocation).
static size_t sx_StringMemory(const string& s)
{
    const size_t capacity = s.capacity();
    if ( capacity == 0 ) {
        return 0;
    }
    const bool separate_buffer = capacity + sizeof(void*) > sizeof(string);
    if ( !separate_buffer ) {
        return capacity;
    }
    // ref-counted
    return capacity + sizeof(void*) + kMallocOverhead;
}
294
295
296 /////////////////////////////////////////////////////////////////////////////
297 // CSeq_id_not_set_Tree
298 /////////////////////////////////////////////////////////////////////////////
299
CSeq_id_not_set_Tree(CSeq_id_Mapper * mapper)300 CSeq_id_not_set_Tree::CSeq_id_not_set_Tree(CSeq_id_Mapper* mapper)
301 : CSeq_id_Which_Tree(mapper)
302 {
303 }
304
305
~CSeq_id_not_set_Tree(void)306 CSeq_id_not_set_Tree::~CSeq_id_not_set_Tree(void)
307 {
308 }
309
310
Empty(void) const311 bool CSeq_id_not_set_Tree::Empty(void) const
312 {
313 return true;
314 }
315
316
317 inline
x_Check(const CSeq_id & id) const318 bool CSeq_id_not_set_Tree::x_Check(const CSeq_id& id) const
319 {
320 return id.Which() == CSeq_id::e_not_set;
321 }
322
323
DropInfo(const CSeq_id_Info *)324 void CSeq_id_not_set_Tree::DropInfo(const CSeq_id_Info* /*info*/)
325 {
326 }
327
328
x_Unindex(const CSeq_id_Info *)329 void CSeq_id_not_set_Tree::x_Unindex(const CSeq_id_Info* /*info*/)
330 {
331 }
332
333
FindInfo(const CSeq_id &) const334 CSeq_id_Handle CSeq_id_not_set_Tree::FindInfo(const CSeq_id& /*id*/) const
335 {
336 return null;
337 }
338
339
FindOrCreate(const CSeq_id &)340 CSeq_id_Handle CSeq_id_not_set_Tree::FindOrCreate(const CSeq_id& /*id*/)
341 {
342 return null;
343 }
344
345
FindMatch(const CSeq_id_Handle &,TSeq_id_MatchList &) const346 void CSeq_id_not_set_Tree::FindMatch(const CSeq_id_Handle& /*id*/,
347 TSeq_id_MatchList& /*id_list*/) const
348 {
349 ERR_POST_X(3, Warning << "CSeq_id_Mapper::GetMatchingHandles() -- "
350 "uninitialized seq-id");
351 }
352
353
FindMatchStr(const string &,TSeq_id_MatchList &) const354 void CSeq_id_not_set_Tree::FindMatchStr(const string& /*sid*/,
355 TSeq_id_MatchList& /*id_list*/) const
356 {
357 }
358
359
FindReverseMatch(const CSeq_id_Handle &,TSeq_id_MatchList &)360 void CSeq_id_not_set_Tree::FindReverseMatch(const CSeq_id_Handle& /*id*/,
361 TSeq_id_MatchList& /*id_list*/)
362 {
363 ERR_POST_X(4, Warning << "CSeq_id_Mapper::GetReverseMatchingHandles() -- "
364 "uninitialized seq-id");
365 }
366
367
Dump(CNcbiOstream & out,CSeq_id::E_Choice type,int details) const368 size_t CSeq_id_not_set_Tree::Dump(CNcbiOstream& out,
369 CSeq_id::E_Choice type,
370 int details) const
371 {
372 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
373 out << "CSeq_id_Handles("<<CSeq_id::SelectionName(type)<<"): ";
374 out << "virtual, no memory" << endl;
375 }
376 return 0;
377 }
378
379
380 /////////////////////////////////////////////////////////////////////////////
381 // CSeq_id_int_Tree
382 /////////////////////////////////////////////////////////////////////////////
383
384
CSeq_id_int_Tree(CSeq_id_Mapper * mapper)385 CSeq_id_int_Tree::CSeq_id_int_Tree(CSeq_id_Mapper* mapper)
386 : CSeq_id_Which_Tree(mapper)
387 {
388 }
389
390
~CSeq_id_int_Tree(void)391 CSeq_id_int_Tree::~CSeq_id_int_Tree(void)
392 {
393 }
394
395
Empty(void) const396 bool CSeq_id_int_Tree::Empty(void) const
397 {
398 return m_IntMap.empty();
399 }
400
401
FindInfo(const CSeq_id & id) const402 CSeq_id_Handle CSeq_id_int_Tree::FindInfo(const CSeq_id& id) const
403 {
404 _ASSERT(x_Check(id));
405 TPacked value = x_Get(id);
406
407 TReadLockGuard guard(m_TreeLock);
408 TIntMap::const_iterator it = m_IntMap.find(value);
409 if (it != m_IntMap.end()) {
410 return CSeq_id_Handle(it->second);
411 }
412 return null;
413 }
414
415
FindOrCreate(const CSeq_id & id)416 CSeq_id_Handle CSeq_id_int_Tree::FindOrCreate(const CSeq_id& id)
417 {
418 _ASSERT(x_Check(id));
419 TPacked value = x_Get(id);
420
421 TWriteLockGuard guard(m_TreeLock);
422 pair<TIntMap::iterator, bool> ins =
423 m_IntMap.insert(TIntMap::value_type(value, nullptr));
424 if ( ins.second ) {
425 ins.first->second = CreateInfo(id);
426 }
427 return CSeq_id_Handle(ins.first->second);
428 }
429
430
x_Unindex(const CSeq_id_Info * info)431 void CSeq_id_int_Tree::x_Unindex(const CSeq_id_Info* info)
432 {
433 _ASSERT(x_Check(*info->GetSeqId()));
434 TPacked value = x_Get(*info->GetSeqId());
435
436 _VERIFY(m_IntMap.erase(value));
437 }
438
439
FindMatchStr(const string & sid,TSeq_id_MatchList & id_list) const440 void CSeq_id_int_Tree::FindMatchStr(const string& sid,
441 TSeq_id_MatchList& id_list) const
442 {
443 TPacked value;
444 try {
445 value = NStr::StringToNumeric<TPacked>(sid);
446 }
447 catch (const CStringException& /*ignored*/) {
448 // Not an integer value
449 return;
450 }
451 TReadLockGuard guard(m_TreeLock);
452 TIntMap::const_iterator it = m_IntMap.find(value);
453 if (it != m_IntMap.end()) {
454 id_list.insert(CSeq_id_Handle(it->second));
455 }
456 }
457
458
Dump(CNcbiOstream & out,CSeq_id::E_Choice type,int details) const459 size_t CSeq_id_int_Tree::Dump(CNcbiOstream& out,
460 CSeq_id::E_Choice type,
461 int details) const
462 {
463 size_t total_bytes = 0;
464 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
465 out << "CSeq_id_Handles("<<CSeq_id::SelectionName(type)<<"): ";
466 }
467 size_t count = m_IntMap.size(), elem_size = 0, extra_size = 0;
468 if ( count ) {
469 elem_size = sizeof(int)+sizeof(void*); // map value
470 elem_size += sizeof(int)+3*sizeof(void*); // red/black tree overhead
471 elem_size += sizeof(CSeq_id_Info); //
472 elem_size += sizeof(CSeq_id); //
473 // malloc overhead:
474 // map value, CSeq_id_Info, CSeq_id
475 elem_size += 3*kMallocOverhead;
476 }
477 size_t bytes = count*elem_size+extra_size;
478 total_bytes += bytes;
479 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
480 out << count << " handles, "<<bytes<<" bytes" << endl;
481 }
482 if ( details >= CSeq_id_Mapper::eDumpAllIds ) {
483 ITERATE ( TIntMap, it, m_IntMap ) {
484 out << " " << it->second->GetSeqId()->AsFastaString() << endl;
485 }
486 }
487 return total_bytes;
488 }
489
490 /////////////////////////////////////////////////////////////////////////////
491 // CSeq_id_Gibbsq_Tree
492 /////////////////////////////////////////////////////////////////////////////
493
CSeq_id_Gibbsq_Tree(CSeq_id_Mapper * mapper)494 CSeq_id_Gibbsq_Tree::CSeq_id_Gibbsq_Tree(CSeq_id_Mapper* mapper)
495 : CSeq_id_int_Tree(mapper)
496 {
497 }
498
499
x_Check(const CSeq_id & id) const500 bool CSeq_id_Gibbsq_Tree::x_Check(const CSeq_id& id) const
501 {
502 return id.IsGibbsq();
503 }
504
505
x_Get(const CSeq_id & id) const506 CSeq_id_Gibbsq_Tree::TPacked CSeq_id_Gibbsq_Tree::x_Get(const CSeq_id& id) const
507 {
508 return INT_ID_FROM(CSeq_id::TGibbsq, id.GetGibbsq());
509 }
510
511
512 /////////////////////////////////////////////////////////////////////////////
513 // CSeq_id_Gibbmt_Tree
514 /////////////////////////////////////////////////////////////////////////////
515
CSeq_id_Gibbmt_Tree(CSeq_id_Mapper * mapper)516 CSeq_id_Gibbmt_Tree::CSeq_id_Gibbmt_Tree(CSeq_id_Mapper* mapper)
517 : CSeq_id_int_Tree(mapper)
518 {
519 }
520
521
x_Check(const CSeq_id & id) const522 bool CSeq_id_Gibbmt_Tree::x_Check(const CSeq_id& id) const
523 {
524 return id.IsGibbmt();
525 }
526
527
x_Get(const CSeq_id & id) const528 CSeq_id_Gibbmt_Tree::TPacked CSeq_id_Gibbmt_Tree::x_Get(const CSeq_id& id) const
529 {
530 return INT_ID_FROM(CSeq_id::TGibbmt, id.GetGibbmt());
531 }
532
533
534 /////////////////////////////////////////////////////////////////////////////
535 // CSeq_id_Gi_Tree
536 /////////////////////////////////////////////////////////////////////////////
537
538
CSeq_id_Gi_Info(CSeq_id_Mapper * mapper)539 CSeq_id_Gi_Info::CSeq_id_Gi_Info(CSeq_id_Mapper* mapper)
540 : CSeq_id_Info(CSeq_id::e_Gi, mapper)
541 {
542 }
543
544
GetPackedSeqId(TPacked gi,TVariant) const545 CConstRef<CSeq_id> CSeq_id_Gi_Info::GetPackedSeqId(TPacked gi, TVariant /*variant*/) const
546 {
547 CConstRef<CSeq_id> ret;
548 typedef CSeq_id_Gi_Info TThis;
549 #if defined NCBI_SLOW_ATOMIC_SWAP
550 CFastMutexGuard guard(sx_GetSeqIdMutex);
551 ret = m_Seq_id;
552 const_cast<TThis*>(this)->m_Seq_id.Reset();
553 if ( !ret || !ret->ReferencedOnlyOnce() ) {
554 ret.Reset(new CSeq_id);
555 }
556 const_cast<TThis*>(this)->m_Seq_id = ret;
557 #else
558 const_cast<TThis*>(this)->m_Seq_id.AtomicReleaseTo(ret);
559 if ( !ret || !ret->ReferencedOnlyOnce() ) {
560 ret.Reset(new CSeq_id);
561 }
562 const_cast<TThis*>(this)->m_Seq_id.AtomicResetFrom(ret);
563 #endif
564 const_cast<CSeq_id&>(*ret).SetGi(GI_FROM(TPacked, gi));
565 return ret;
566 }
567
568
CSeq_id_Gi_Tree(CSeq_id_Mapper * mapper)569 CSeq_id_Gi_Tree::CSeq_id_Gi_Tree(CSeq_id_Mapper* mapper)
570 : CSeq_id_Which_Tree(mapper),
571 m_ZeroInfo(0),
572 m_SharedInfo(0)
573 {
574 }
575
576
~CSeq_id_Gi_Tree(void)577 CSeq_id_Gi_Tree::~CSeq_id_Gi_Tree(void)
578 {
579 }
580
581
Empty(void) const582 bool CSeq_id_Gi_Tree::Empty(void) const
583 {
584 return true;
585 }
586
587
588 inline
x_Check(const CSeq_id & id) const589 bool CSeq_id_Gi_Tree::x_Check(const CSeq_id& id) const
590 {
591 return id.IsGi();
592 }
593
594
595 inline
x_Get(const CSeq_id & id) const596 TGi CSeq_id_Gi_Tree::x_Get(const CSeq_id& id) const
597 {
598 return id.GetGi();
599 }
600
601
x_Unindex(const CSeq_id_Info * info)602 void CSeq_id_Gi_Tree::x_Unindex(const CSeq_id_Info* info)
603 {
604 if ( info == m_SharedInfo ) {
605 m_SharedInfo = 0;
606 }
607 else if ( info == m_ZeroInfo ) {
608 m_ZeroInfo = 0;
609 }
610 }
611
612
GetGiHandle(TGi gi)613 CSeq_id_Handle CSeq_id_Gi_Tree::GetGiHandle(TGi gi)
614 {
615 if ( gi != ZERO_GI ) {
616 TWriteLockGuard guard(m_TreeLock);
617 if ( !m_SharedInfo ) {
618 m_SharedInfo = new CSeq_id_Gi_Info(m_Mapper);
619 }
620 return CSeq_id_Handle(m_SharedInfo, GI_TO(TPacked, gi));
621 }
622 else {
623 TWriteLockGuard guard(m_TreeLock);
624 if ( !m_ZeroInfo ) {
625 CRef<CSeq_id> zero_id(new CSeq_id);
626 zero_id->SetGi(ZERO_GI);
627 m_ZeroInfo = CreateInfo(*zero_id);
628 }
629 return CSeq_id_Handle(m_ZeroInfo);
630 }
631 }
632
633
FindInfo(const CSeq_id & id) const634 CSeq_id_Handle CSeq_id_Gi_Tree::FindInfo(const CSeq_id& id) const
635 {
636 CSeq_id_Handle ret;
637 _ASSERT(x_Check(id));
638 TPacked gi = GI_TO(TPacked, x_Get(id));
639 TReadLockGuard guard(m_TreeLock);
640 if ( gi ) {
641 if ( m_SharedInfo ) {
642 ret = CSeq_id_Handle(m_SharedInfo, gi);
643 }
644 }
645 else if ( m_ZeroInfo ) {
646 ret = CSeq_id_Handle(m_ZeroInfo);
647 }
648 return ret;
649 }
650
651
FindOrCreate(const CSeq_id & id)652 CSeq_id_Handle CSeq_id_Gi_Tree::FindOrCreate(const CSeq_id& id)
653 {
654 _ASSERT(x_Check(id));
655 return GetGiHandle(x_Get(id));
656 }
657
658
FindMatchStr(const string & sid,TSeq_id_MatchList & id_list) const659 void CSeq_id_Gi_Tree::FindMatchStr(const string& sid,
660 TSeq_id_MatchList& id_list) const
661 {
662 TPacked gi;
663 try {
664 gi = NStr::StringToNumeric<TPacked>(sid);
665 }
666 catch (const CStringException& /*ignored*/) {
667 // Not an integer value
668 return;
669 }
670 if (gi) {
671 id_list.insert(CSeq_id_Handle(m_SharedInfo, gi));
672 }
673 else if ( m_ZeroInfo ) {
674 id_list.insert(CSeq_id_Handle(m_ZeroInfo));
675 }
676 }
677
678
Dump(CNcbiOstream & out,CSeq_id::E_Choice type,int details) const679 size_t CSeq_id_Gi_Tree::Dump(CNcbiOstream& out,
680 CSeq_id::E_Choice type,
681 int details) const
682 {
683 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
684 out << "CSeq_id_Handles("<<CSeq_id::SelectionName(type)<<"): ";
685 out << "virtual, small constant memory";
686 out << endl;
687 }
688 return 0;
689 }
690
691 /////////////////////////////////////////////////////////////////////////////
692 // CSeq_id_Textseq_Tree
693 /////////////////////////////////////////////////////////////////////////////
694
695
696 NCBI_PARAM_DECL(bool, OBJECTS, PACK_TEXTID);
697 NCBI_PARAM_DEF_EX(bool, OBJECTS, PACK_TEXTID, true,
698 eParam_NoThread, OBJECTS_PACK_TEXTID);
s_PackTextidEnabled(void)699 static inline bool s_PackTextidEnabled(void)
700 {
701 static CSafeStatic<NCBI_PARAM_TYPE(OBJECTS, PACK_TEXTID)> value;
702 return value->Get();
703 }
704
705 NCBI_PARAM_DECL(bool, OBJECTS, PACK_GENERAL);
706 NCBI_PARAM_DEF_EX(bool, OBJECTS, PACK_GENERAL, true,
707 eParam_NoThread, OBJECTS_PACK_GENERAL);
s_PackGeneralEnabled(void)708 static inline bool s_PackGeneralEnabled(void)
709 {
710 static CSafeStatic<NCBI_PARAM_TYPE(OBJECTS, PACK_GENERAL)> value;
711 return value->Get();
712 }
713
714 static inline
s_RestoreNumber(string & str,size_t pos,size_t len,TIntId number)715 void s_RestoreNumber(string& str, size_t pos, size_t len, TIntId number)
716 {
717 char* start = &str[pos];
718 char* ptr = start + len;
719 while ( number ) {
720 *--ptr = (char)('0' + number % 10);
721 number /= 10;
722 }
723 while ( ptr > start ) {
724 *--ptr = '0';
725 }
726 }
727
728 static inline
s_ParseNumber(const string & str,size_t pos,size_t len)729 TIntId s_ParseNumber(const string& str, size_t pos, size_t len)
730 {
731 TIntId number = 0;
732 for ( size_t i = pos; i < pos+len; ++i ) {
733 number = number * 10 + (str[i]-'0');
734 }
735 return number;
736 }
737
738
739 static inline
s_RestoreCaseVariant(string & str,size_t len,CSeq_id_Handle::TVariant variant)740 CSeq_id_Handle::TVariant s_RestoreCaseVariant(string& str, size_t len,
741 CSeq_id_Handle::TVariant variant)
742 {
743 for ( size_t i = 0; variant && i != len; ++i ) {
744 int c = Uint1(str[i]);
745 if ( isalpha(c) ) {
746 if ( variant & 1 ) {
747 // flip case
748 if ( islower(c) ) {
749 c = toupper(c);
750 }
751 else {
752 c = tolower(c);
753 }
754 str[i] = c;
755 }
756 variant >>= 1;
757 }
758 }
759 return variant;
760 }
761
762
763 static inline
s_RestoreCaseVariant(string & str,CSeq_id_Handle::TVariant variant)764 CSeq_id_Handle::TVariant s_RestoreCaseVariant(string& str, CSeq_id_Handle::TVariant variant)
765 {
766 return s_RestoreCaseVariant(str, str.size(), variant);
767 }
768
769
770 static inline
771 pair<CSeq_id_Handle::TVariant, CSeq_id_Handle::TVariant>
s_ParseCaseVariant(CTempString ref,const char * str,CSeq_id_Handle::TVariant bit)772 s_ParseCaseVariant(CTempString ref, const char* str,
773 CSeq_id_Handle::TVariant bit)
774 {
775 CSeq_id_Handle::TVariant variant = 0;
776 for ( size_t i = 0; bit && i != ref.size(); ++i ) {
777 int cr = Uint1(ref[i]);
778 if ( !isalpha(cr) ) {
779 continue;
780 }
781 int cs = Uint1(str[i]);
782 if ( cs != cr ) {
783 _ASSERT((isupper(cs) && tolower(cs) == cr) ||
784 (islower(cs) && toupper(cs) == cr));
785 variant |= bit;
786 }
787 bit <<= 1;
788 }
789 return make_pair(variant, bit);
790 }
791
792
793 static inline
794 pair<CSeq_id_Handle::TVariant, CSeq_id_Handle::TVariant>
s_ParseCaseVariant(CTempString ref,const string & str,CSeq_id_Handle::TVariant bit=1)795 s_ParseCaseVariant(CTempString ref, const string& str,
796 CSeq_id_Handle::TVariant bit = 1)
797 {
798 _ASSERT(ref.size() <= str.size());
799 return s_ParseCaseVariant(ref, str.data(), bit);
800 }
801
802
803 static inline
s_RestoreNumberAndCaseVariant(string & str,size_t pos,size_t len,TIntId number,CSeq_id_Handle::TVariant variant)804 void s_RestoreNumberAndCaseVariant(string& str, size_t pos, size_t len, TIntId number,
805 CSeq_id_Handle::TVariant variant)
806 {
807 s_RestoreNumber(str, pos, len, number);
808 if ( variant ) {
809 s_RestoreCaseVariant(str, pos, variant);
810 }
811 }
812
813
CSeq_id_Textseq_Info(CSeq_id::E_Choice type,CSeq_id_Mapper * mapper,const TKey & key)814 CSeq_id_Textseq_Info::CSeq_id_Textseq_Info(CSeq_id::E_Choice type,
815 CSeq_id_Mapper* mapper,
816 const TKey& key)
817 : CSeq_id_Info(type, mapper),
818 m_Key(key)
819 {
820 }
821
822
~CSeq_id_Textseq_Info(void)823 CSeq_id_Textseq_Info::~CSeq_id_Textseq_Info(void)
824 {
825 }
826
827
828 CSeq_id_Textseq_Info::TKey
ParseAcc(const string & acc,const TVersion * ver)829 CSeq_id_Textseq_Info::ParseAcc(const string& acc,
830 const TVersion* ver)
831 {
832 TKey key;
833 size_t len = acc.size(), prefix_len = len, most_significant = NPOS;
834 while ( prefix_len ) {
835 char c = acc[--prefix_len];
836 if ( c >= '1' && c <= '9' ) {
837 most_significant = prefix_len;
838 }
839 else if ( c != '0' ) {
840 ++prefix_len;
841 break;
842 }
843 }
844 if ( most_significant == NPOS ) {
845 return key;
846 }
847 size_t acc_digits = len - prefix_len, real_digits = len - most_significant;
848 if ( acc_digits < 2 || acc_digits > 12 ||
849 real_digits > 9 || acc_digits*2 < prefix_len ) {
850 return key;
851 }
852 if ( prefix_len <= 4 ) {
853 // good
854 }
855 else if ( prefix_len == 3 ) {
856 if ( (acc[0] != 'N' && acc[0] != 'Y') ||
857 (acc[1] != 'P' && acc[1] != 'C') ||
858 (acc[2] != '_') ) {
859 return key;
860 }
861 }
862 else {
863 return key;
864 }
865 if ( acc_digits > 6 && real_digits < acc_digits ) {
866 acc_digits = max(size_t(6), real_digits);
867 prefix_len = len - acc_digits;
868 }
869 if ( prefix_len > key.kMaxPrefixLen ) {
870 return key;
871 }
872 key.m_PrefixLen = prefix_len;
873 memcpy(key.m_PrefixBuf, acc.data(), prefix_len);
874 unsigned hash = 0;
875 for ( size_t i = 0; i < 3 && i < prefix_len; ++i ) {
876 hash = (hash << 8) | toupper(key.m_PrefixBuf[i] & 0xff);
877 }
878 hash = (hash << 8) | unsigned(acc_digits << 1);
879 key.m_Hash = hash;
880 if ( ver ) {
881 key.SetVersion(*ver);
882 }
883 return key;
884 }
885
886
RestoreAccession(string & acc,TPacked param,TVariant variant) const887 void CSeq_id_Textseq_Info::RestoreAccession(string& acc, TPacked param, TVariant variant) const
888 {
889 acc = GetAccPrefix();
890 acc.resize(acc.size() + GetAccDigits(), '0');
891 s_RestoreNumberAndCaseVariant(acc, GetAccPrefix().size(), GetAccDigits(), param, variant);
892 }
893
894
Restore(CTextseq_id & id,TPacked param,TVariant variant) const895 void CSeq_id_Textseq_Info::Restore(CTextseq_id& id, TPacked param, TVariant variant) const
896 {
897 if ( !id.IsSetAccession() ) {
898 id.SetAccession(GetAccPrefix());
899 string& acc = id.SetAccession();
900 acc.resize(acc.size() + GetAccDigits(), '0');
901 if ( IsSetVersion() ) {
902 id.SetVersion(GetVersion());
903 }
904 }
905 s_RestoreNumberAndCaseVariant(id.SetAccession(),
906 GetAccPrefix().size(), GetAccDigits(), param, variant);
907 }
908
909
910 inline
911 CSeq_id_Textseq_Info::TPacked
Pack(const TKey & key,const string & acc)912 CSeq_id_Textseq_Info::Pack(const TKey& key, const string& acc)
913 {
914 return s_ParseNumber(acc, key.GetPrefixLen(), key.GetAccDigits());
915 }
916
917
918 inline
919 CSeq_id_Textseq_Info::TPacked
Pack(const TKey & key,const CTextseq_id & tid)920 CSeq_id_Textseq_Info::Pack(const TKey& key, const CTextseq_id& tid)
921 {
922 return Pack(key, tid.GetAccession());
923 }
924
925
926 inline
927 CSeq_id_Info::TVariant
ParseCaseVariant(const CSeq_id_Info * info,const string & acc)928 CSeq_id_Textseq_Info::ParseCaseVariant(const CSeq_id_Info* info, const string& acc)
929 {
930 return s_ParseCaseVariant(info->GetSeqId()->GetTextseq_Id()->GetAccession(), acc).first;
931 }
932
933
934 inline
935 CSeq_id_Info::TVariant
ParseCaseVariant(const string & acc) const936 CSeq_id_Textseq_Info::TKey::ParseCaseVariant(const string& acc) const
937 {
938 return s_ParseCaseVariant(GetAccPrefix(), acc).first;
939 }
940
941
GetPackedSeqId(TPacked param,TVariant variant) const942 CConstRef<CSeq_id> CSeq_id_Textseq_Info::GetPackedSeqId(TPacked param, TVariant variant) const
943 {
944 CConstRef<CSeq_id> ret;
945 typedef CSeq_id_Textseq_Info TThis;
946 if ( variant ) {
947 // all non-initial case variants need fresh Seq-id to start with
948 ret = new CSeq_id;
949 }
950 else {
951 // otherwise try to use shared Seq-id if it's not referenced anywhere else
952 #if defined NCBI_SLOW_ATOMIC_SWAP
953 CFastMutexGuard guard(sx_GetSeqIdMutex);
954 ret = m_Seq_id;
955 const_cast<TThis*>(this)->m_Seq_id.Reset();
956 if ( !ret || !ret->ReferencedOnlyOnce() ) {
957 ret.Reset(new CSeq_id);
958 }
959 const_cast<TThis*>(this)->m_Seq_id = ret;
960 #else
961 const_cast<TThis*>(this)->m_Seq_id.AtomicReleaseTo(ret);
962 if ( !ret || !ret->ReferencedOnlyOnce() ) {
963 ret.Reset(new CSeq_id);
964 }
965 const_cast<TThis*>(this)->m_Seq_id.AtomicResetFrom(ret);
966 #endif
967 }
968 // split accession number and version
969 const_cast<CSeq_id&>(*ret).Select(GetType(), eDoNotResetVariant);
970 Restore(*const_cast<CTextseq_id*>(ret->GetTextseq_Id()), param, variant);
971 return ret;
972 }
973
974
CSeq_id_Textseq_PlainInfo(const CConstRef<CSeq_id> & seq_id,CSeq_id_Mapper * mapper)975 CSeq_id_Textseq_PlainInfo::CSeq_id_Textseq_PlainInfo(const CConstRef<CSeq_id>& seq_id,
976 CSeq_id_Mapper* mapper)
977 : CSeq_id_Info(seq_id, mapper)
978 {
979 }
980
981
982 inline
983 CSeq_id_Info::TVariant
ParseCaseVariant(const string & acc) const984 CSeq_id_Textseq_PlainInfo::ParseCaseVariant(const string& acc) const
985 {
986 return s_ParseCaseVariant(m_Seq_id->GetTextseq_Id()->GetAccession(), acc).first;
987 }
988
989
990 inline
991 CSeq_id_Info::TVariant
ParseCaseVariant(const CTextseq_id & id) const992 CSeq_id_Textseq_PlainInfo::ParseCaseVariant(const CTextseq_id& id) const
993 {
994 if ( !id.IsSetAccession() ) {
995 return 0;
996 }
997 return s_ParseCaseVariant(m_Seq_id->GetTextseq_Id()->GetAccession(), id.GetAccession()).first;
998 }
999
1000
GetPackedSeqId(TPacked packed,TVariant variant) const1001 CConstRef<CSeq_id> CSeq_id_Textseq_PlainInfo::GetPackedSeqId(TPacked packed, TVariant variant) const
1002 {
1003 _ASSERT(!packed);
1004 _ASSERT(variant);
1005 CRef<CSeq_id> ret(new CSeq_id);
1006 s_AssignSeq_id(*ret, *m_Seq_id);
1007 s_RestoreCaseVariant(const_cast<CTextseq_id*>(ret->GetTextseq_Id())->SetAccession(), variant);
1008 return ret;
1009 }
1010
1011
CSeq_id_Textseq_Tree(CSeq_id_Mapper * mapper,CSeq_id::E_Choice type)1012 CSeq_id_Textseq_Tree::CSeq_id_Textseq_Tree(CSeq_id_Mapper* mapper,
1013 CSeq_id::E_Choice type)
1014 : CSeq_id_Which_Tree(mapper),
1015 m_Type(type)
1016 {
1017 }
1018
1019
~CSeq_id_Textseq_Tree(void)1020 CSeq_id_Textseq_Tree::~CSeq_id_Textseq_Tree(void)
1021 {
1022 }
1023
1024
x_Check(const CSeq_id::E_Choice & type) const1025 bool CSeq_id_Textseq_Tree::x_Check(const CSeq_id::E_Choice& type) const
1026 {
1027 return type == m_Type;
1028 }
1029
1030
x_Check(const CSeq_id & id) const1031 bool CSeq_id_Textseq_Tree::x_Check(const CSeq_id& id) const
1032 {
1033 return x_Check(id.Which());
1034 }
1035
1036
Empty(void) const1037 bool CSeq_id_Textseq_Tree::Empty(void) const
1038 {
1039 return m_ByName.empty() && m_ByAcc.empty() && m_PackedMap.empty();
1040 }
1041
1042
x_Equals(const CTextseq_id & id1,const CTextseq_id & id2)1043 bool CSeq_id_Textseq_Tree::x_Equals(const CTextseq_id& id1,
1044 const CTextseq_id& id2)
1045 {
1046 if ( id1.IsSetAccession() != id2.IsSetAccession() ) {
1047 return false;
1048 }
1049 if ( id1.IsSetName() != id2.IsSetName() ) {
1050 return false;
1051 }
1052 if ( id1.IsSetVersion() != id2.IsSetVersion() ) {
1053 return false;
1054 }
1055 if ( id1.IsSetRelease() != id2.IsSetRelease() ) {
1056 return false;
1057 }
1058 if ( id1.IsSetAccession() &&
1059 !NStr::EqualNocase(id1.GetAccession(), id2.GetAccession()) ) {
1060 return false;
1061 }
1062 if ( id1.IsSetName() &&
1063 !NStr::EqualNocase(id1.GetName(), id2.GetName()) ) {
1064 return false;
1065 }
1066 if ( id1.IsSetVersion() &&
1067 id1.GetVersion() != id2.GetVersion() ) {
1068 return false;
1069 }
1070 if ( id1.IsSetRelease() &&
1071 id1.GetRelease() != id2.GetRelease() ) {
1072 return false;
1073 }
1074 return true;
1075 }
1076
1077
1078 CSeq_id_Textseq_PlainInfo*
x_FindStrInfo(const TStringMap & str_map,const string & str,CSeq_id::E_Choice type,const CTextseq_id & tid) const1079 CSeq_id_Textseq_Tree::x_FindStrInfo(const TStringMap& str_map,
1080 const string& str,
1081 CSeq_id::E_Choice type,
1082 const CTextseq_id& tid) const
1083 {
1084 for ( TStringMapCI vit = str_map.find(str);
1085 vit != str_map.end() && NStr::EqualNocase(vit->first, str);
1086 ++vit ) {
1087 CConstRef<CSeq_id> id = vit->second->GetSeqId();
1088 if ( id->Which() == type && x_Equals(tid, x_Get(*id)) ) {
1089 return vit->second;
1090 }
1091 }
1092 return 0;
1093 }
1094
1095
1096 inline
1097 CSeq_id_Textseq_PlainInfo*
x_FindStrInfo(CSeq_id::E_Choice type,const CTextseq_id & tid) const1098 CSeq_id_Textseq_Tree::x_FindStrInfo(CSeq_id::E_Choice type,
1099 const CTextseq_id& tid) const
1100 {
1101 if ( tid.IsSetAccession() ) {
1102 return x_FindStrInfo(m_ByAcc, tid.GetAccession(), type, tid);
1103 }
1104 else if ( tid.IsSetName() ) {
1105 return x_FindStrInfo(m_ByName, tid.GetName(), type, tid);
1106 }
1107 else {
1108 return 0;
1109 }
1110 }
1111
1112
FindInfo(const CSeq_id & id) const1113 CSeq_id_Handle CSeq_id_Textseq_Tree::FindInfo(const CSeq_id& id) const
1114 {
1115 // Note: if a record is found by accession, no name is checked
1116 // even if it is also set.
1117 _ASSERT(x_Check(id));
1118 const CTextseq_id& tid = x_Get(id);
1119 // Can not compare if no accession given
1120 if ( s_PackTextidEnabled() &&
1121 tid.IsSetAccession() && !tid.IsSetName() && !tid.IsSetRelease() ) {
1122 const string& acc = tid.GetAccession();
1123 TPackedKey key = CSeq_id_Textseq_Info::ParseAcc(acc, tid);
1124 if ( key ) {
1125 TPacked packed = CSeq_id_Textseq_Info::Pack(key, tid);
1126 TReadLockGuard guard(m_TreeLock);
1127 TPackedMap_CI it = m_PackedMap.find(key);
1128 if ( it == m_PackedMap.end() ) {
1129 return null;
1130 }
1131 return CSeq_id_Handle(it->second, packed, it->first.ParseCaseVariant(acc));
1132 }
1133 }
1134 TReadLockGuard guard(m_TreeLock);
1135 CSeq_id_Textseq_PlainInfo* info = x_FindStrInfo(id.Which(), tid);
1136 CSeq_id_Handle::TVariant variant = info? info->ParseCaseVariant(tid): 0;
1137 return CSeq_id_Handle(info, 0, variant);
1138 }
1139
FindOrCreate(const CSeq_id & id)1140 CSeq_id_Handle CSeq_id_Textseq_Tree::FindOrCreate(const CSeq_id& id)
1141 {
1142 _ASSERT(x_Check(id));
1143 const CTextseq_id& tid = x_Get(id);
1144 if ( s_PackTextidEnabled() &&
1145 tid.IsSetAccession() && !tid.IsSetName() && !tid.IsSetRelease() ) {
1146 const string& acc = tid.GetAccession();
1147 TPackedKey key = CSeq_id_Textseq_Info::ParseAcc(acc, tid);
1148 if ( key ) {
1149 TPacked packed = CSeq_id_Textseq_Info::Pack(key, tid);
1150 CSeq_id_Handle::TVariant variant = 0;
1151 TWriteLockGuard guard(m_TreeLock);
1152 TPackedMap_I it = m_PackedMap.lower_bound(key);
1153 if ( it == m_PackedMap.end() ||
1154 m_PackedMap.key_comp()(key, it->first) ) {
1155 CConstRef<CSeq_id_Textseq_Info> info
1156 (new CSeq_id_Textseq_Info(id.Which(), m_Mapper, key));
1157 it = m_PackedMap.insert(it, TPackedMapValue(key, info));
1158 }
1159 else {
1160 variant = it->first.ParseCaseVariant(acc);
1161 }
1162 return CSeq_id_Handle(it->second, packed, variant);
1163 }
1164 }
1165 TWriteLockGuard guard(m_TreeLock);
1166 CSeq_id_Textseq_PlainInfo* info = x_FindStrInfo(id.Which(), tid);
1167 CSeq_id_Handle::TVariant variant = 0;
1168 if ( !info ) {
1169 CRef<CSeq_id> ref_id(new CSeq_id);
1170 s_AssignSeq_id(*ref_id, id);
1171 info = new CSeq_id_Textseq_PlainInfo(ref_id, m_Mapper);
1172 if ( tid.IsSetAccession() ) {
1173 m_ByAcc.insert(TStringMapValue(tid.GetAccession(), info));
1174 }
1175 if ( tid.IsSetName() ) {
1176 m_ByName.insert(TStringMapValue(tid.GetName(), info));
1177 }
1178 }
1179 else {
1180 variant = info->ParseCaseVariant(tid);
1181 }
1182 return CSeq_id_Handle(info, 0, variant);
1183 }
1184
1185
x_Erase(TStringMap & str_map,const string & key,const CSeq_id_Info * info)1186 void CSeq_id_Textseq_Tree::x_Erase(TStringMap& str_map,
1187 const string& key,
1188 const CSeq_id_Info* info)
1189 {
1190 for ( TStringMap::iterator it = str_map.find(key);
1191 it != str_map.end() && NStr::EqualNocase(it->first, key);
1192 ++it ) {
1193 if ( it->second == info ) {
1194 str_map.erase(it);
1195 return;
1196 }
1197 }
1198 }
1199
1200
x_Unindex(const CSeq_id_Info * info)1201 void CSeq_id_Textseq_Tree::x_Unindex(const CSeq_id_Info* info)
1202 {
1203 if ( !m_PackedMap.empty() ) {
1204 const CSeq_id_Textseq_Info* sinfo =
1205 dynamic_cast<const CSeq_id_Textseq_Info*>(info);
1206 if ( sinfo ) {
1207 m_PackedMap.erase(sinfo->GetKey());
1208 return;
1209 }
1210 }
1211 CConstRef<CSeq_id> tid_id = info->GetSeqId();
1212 _ASSERT(x_Check(*tid_id));
1213 const CTextseq_id& tid = x_Get(*tid_id);
1214 if ( tid.IsSetAccession() ) {
1215 x_Erase(m_ByAcc, tid.GetAccession(), info);
1216 }
1217 if ( tid.IsSetName() ) {
1218 x_Erase(m_ByName, tid.GetName(), info);
1219 }
1220 }
1221
1222
// True for the two release strings "reviewed" and "unreviewed"
// (exact, case-sensitive comparison).
static inline
bool x_IsDefaultSwissprotRelease(const string& release)
{
    if ( release == "reviewed" ) {
        return true;
    }
    return release == "unreviewed";
}
1228
1229
x_FindMatchByAcc(TSeq_id_MatchList & id_list,const string & acc,const TVersion * ver) const1230 void CSeq_id_Textseq_Tree::x_FindMatchByAcc(TSeq_id_MatchList& id_list,
1231 const string& acc,
1232 const TVersion* ver) const
1233 {
1234 if ( !m_PackedMap.empty() ) {
1235 if ( TPackedKey key = CSeq_id_Textseq_Info::ParseAcc(acc, ver) ) {
1236 if ( key.IsSetVersion() ) {
1237 // only same version
1238 TPackedMap_CI it = m_PackedMap.find(key);
1239 if ( it != m_PackedMap.end() ) {
1240 TPacked packed = CSeq_id_Textseq_Info::Pack(key, acc);
1241 id_list.insert(CSeq_id_Handle(it->second, packed));
1242 }
1243 }
1244 else {
1245 // all versions
1246 TPacked packed = 0;
1247 for ( TPackedMap_CI it = m_PackedMap.lower_bound(key);
1248 it != m_PackedMap.end() && it->first.SameHashNoVer(key);
1249 ++it ) {
1250 if ( it->first.EqualAcc(key) ) {
1251 if ( packed == 0 ) {
1252 packed = CSeq_id_Textseq_Info::Pack(key, acc);
1253 }
1254 _ASSERT(packed==CSeq_id_Textseq_Info::Pack(key, acc));
1255 id_list.insert(CSeq_id_Handle(it->second, packed));
1256 }
1257 }
1258 }
1259 }
1260 }
1261
1262 for ( TStringMapCI vit = m_ByAcc.find(acc);
1263 vit != m_ByAcc.end() && NStr::EqualNocase(vit->first, acc);
1264 ++vit ) {
1265 if ( ver ) {
1266 CConstRef<CSeq_id> tst_id = vit->second->GetSeqId();
1267 const CTextseq_id& tst = x_Get(*tst_id);
1268 // acc.ver should match
1269 if ( !tst.IsSetVersion() || tst.GetVersion() != *ver ) {
1270 continue;
1271 }
1272 }
1273 id_list.insert(CSeq_id_Handle(vit->second));
1274 }
1275 }
1276
1277
1278 void
x_FindRevMatchByAccPacked(TSeq_id_MatchList & id_list,const string & acc,const TVersion * ver) const1279 CSeq_id_Textseq_Tree::x_FindRevMatchByAccPacked(TSeq_id_MatchList& id_list,
1280 const string& acc,
1281 const TVersion* ver) const
1282 {
1283 if ( !m_PackedMap.empty() ) {
1284 if ( TPackedKey key = CSeq_id_Textseq_Info::ParseAcc(acc, ver) ) {
1285 TPackedMap_CI it = m_PackedMap.find(key);
1286 if ( it != m_PackedMap.end() ) {
1287 TPacked packed = CSeq_id_Textseq_Info::Pack(key, acc);
1288 id_list.insert(CSeq_id_Handle(it->second, packed));
1289 }
1290 if ( key.IsSetVersion() ) {
1291 // no version too
1292 key.ResetVersion();
1293 TPackedMap_CI itm = m_PackedMap.find(key);
1294 if ( itm != m_PackedMap.end() ) {
1295 TPacked packed = CSeq_id_Textseq_Info::Pack(key, acc);
1296 id_list.insert(CSeq_id_Handle(itm->second, packed));
1297 }
1298 }
1299 }
1300 }
1301 }
1302
1303
1304 void
x_FindRevMatchByAccNonPacked(TSeq_id_MatchList & id_list,const string & acc,const TVersion * ver) const1305 CSeq_id_Textseq_Tree::x_FindRevMatchByAccNonPacked(TSeq_id_MatchList& id_list,
1306 const string& acc,
1307 const TVersion* ver) const
1308 {
1309 for ( TStringMapCI vit = m_ByAcc.find(acc);
1310 vit != m_ByAcc.end() && NStr::EqualNocase(vit->first, acc);
1311 ++vit ) {
1312 CConstRef<CSeq_id> tst_id = vit->second->GetSeqId();
1313 const CTextseq_id& tst = x_Get(*tst_id);
1314 if ( tst.IsSetVersion() &&
1315 (!ver || tst.GetVersion() != *ver) ) {
1316 continue;
1317 }
1318 id_list.insert(CSeq_id_Handle(vit->second));
1319 }
1320 }
1321
1322
1323 inline
1324 void
x_FindRevMatchByAcc(TSeq_id_MatchList & id_list,const string & acc,const TVersion * ver) const1325 CSeq_id_Textseq_Tree::x_FindRevMatchByAcc(TSeq_id_MatchList& id_list,
1326 const string& acc,
1327 const TVersion* ver) const
1328 {
1329 x_FindRevMatchByAccPacked(id_list, acc, ver);
1330 x_FindRevMatchByAccNonPacked(id_list, acc, ver);
1331 }
1332
1333
x_FindMatchByName(TSeq_id_MatchList & id_list,const string & name,const CTextseq_id * tid) const1334 void CSeq_id_Textseq_Tree::x_FindMatchByName(TSeq_id_MatchList& id_list,
1335 const string& name,
1336 const CTextseq_id* tid) const
1337 {
1338 for ( TStringMapCI vit = m_ByName.find(name);
1339 vit != m_ByName.end() && NStr::EqualNocase(vit->first, name);
1340 ++vit ) {
1341 if ( tid ) {
1342 CConstRef<CSeq_id> tst_id = vit->second->GetSeqId();
1343 const CTextseq_id& tst = x_Get(*tst_id);
1344 // name.rel should match
1345 if ( tst.IsSetAccession() && tid->IsSetAccession() ) {
1346 // both accessions are set.
1347 // if they are the same - match will be found by accession,
1348 // otherwise accessions are different and there is no match.
1349 continue;
1350 }
1351 if ( tid->IsSetRelease() ) {
1352 if ( tst.IsSetRelease() ||
1353 !(m_Type == CSeq_id::e_Swissprot &&
1354 x_IsDefaultSwissprotRelease(tid->GetRelease())) ) {
1355 if ( !tst.IsSetRelease() ||
1356 tst.GetRelease() != tid->GetRelease() ) {
1357 continue;
1358 }
1359 }
1360 }
1361 }
1362 id_list.insert(CSeq_id_Handle(vit->second));
1363 }
1364 }
1365
1366
x_FindRevMatchByName(TSeq_id_MatchList &,const string &,const CTextseq_id *) const1367 void CSeq_id_Textseq_Tree::x_FindRevMatchByName(TSeq_id_MatchList& /*id_list*/,
1368 const string& /*name*/,
1369 const CTextseq_id* /*tid*/) const
1370 {
1371 /*
1372 for ( TStringMapCI vit = m_ByName.find(name);
1373 vit != m_ByName.end() && NStr::EqualNocase(vit->first, name);
1374 ++vit ) {
1375 if ( tid ) {
1376 CConstRef<CSeq_id> tst_id = vit->second->GetSeqId();
1377 const CTextseq_id& tst = x_Get(*tst_id);
1378 // name.rel should match
1379 if ( tst.IsSetAccession() && tid->IsSetAccession() ) {
1380 // both accessions are set.
1381 // if they are the same - match will be found by accession,
1382 // otherwise accessions are different and there is no match.
1383 continue;
1384 }
1385 if ( tid->IsSetRelease() ) {
1386 if ( tst.IsSetRelease() ||
1387 !(m_Type == CSeq_id::e_Swissprot &&
1388 x_IsDefaultSwissprotRelease(tid->GetRelease())) ) {
1389 if ( !tst.IsSetRelease() ||
1390 tst.GetRelease() != tid->GetRelease() ) {
1391 continue;
1392 }
1393 }
1394 }
1395 }
1396 id_list.insert(CSeq_id_Handle(vit->second));
1397 }
1398 */
1399 }
1400
1401
HaveMatch(const CSeq_id_Handle &) const1402 bool CSeq_id_Textseq_Tree::HaveMatch(const CSeq_id_Handle& ) const
1403 {
1404 return true;
1405 }
1406
1407
FindMatch(const CSeq_id_Handle & id,TSeq_id_MatchList & id_list) const1408 void CSeq_id_Textseq_Tree::FindMatch(const CSeq_id_Handle& id,
1409 TSeq_id_MatchList& id_list) const
1410 {
1411 bool mine = x_Check(id.Which());
1412 if ( mine ) {
1413 id_list.insert(id);
1414 }
1415 TReadLockGuard guard(m_TreeLock);
1416 if ( id.IsPacked() ) {
1417 const CSeq_id_Textseq_Info* info =
1418 static_cast<const CSeq_id_Textseq_Info*>(GetInfo(id));
1419 if ( !m_ByAcc.empty() ) {
1420 // potentially whole search
1421 TStringMapCI it = m_ByAcc.lower_bound(info->GetAccPrefix());
1422 if ( it != m_ByAcc.end() && info->GoodPrefix(it->first) ) {
1423 // have similar accessions
1424 CTextseq_id tid;
1425 info->Restore(tid, id.GetPacked(), id.GetVariant());
1426 x_FindMatchByAcc(id_list, tid.GetAccession(), &tid);
1427 // x_FindMatchByAcc will search packed accessions too
1428 return;
1429 }
1430 }
1431 // only packed search -> no need to decode
1432 if ( !mine ) { // weak matching
1433 TPackedMap_CI iter = m_PackedMap.find(info->GetKey());
1434 if ( iter != m_PackedMap.end() ) {
1435 id_list.insert(CSeq_id_Handle(iter->second, id.GetPacked(), id.GetVariant()));
1436 }
1437 }
1438 if ( !info->IsSetVersion() ) {
1439 // add all known versions
1440 const TPackedKey& key = info->GetKey();
1441 for ( TPackedMap_CI it = m_PackedMap.lower_bound(key);
1442 it != m_PackedMap.end() && it->first.SameHashNoVer(key);
1443 ++it ) {
1444 if ( it->first.EqualAcc(key) ) {
1445 id_list.insert(CSeq_id_Handle(it->second, id.GetPacked(), id.GetVariant()));
1446 }
1447 }
1448 }
1449 }
1450 else {
1451 CConstRef<CSeq_id> tid_id = id.GetSeqId();
1452 const CTextseq_id* tid = tid_id->GetTextseq_Id();
1453 _ASSERT(tid);
1454 if ( tid->IsSetAccession() ) {
1455 x_FindMatchByAcc(id_list, tid->GetAccession(), tid);
1456 }
1457 if ( tid->IsSetName() ) {
1458 x_FindMatchByName(id_list, tid->GetName(), tid);
1459 }
1460 }
1461 }
1462
1463
FindMatchStr(const string & sid,TSeq_id_MatchList & id_list) const1464 void CSeq_id_Textseq_Tree::FindMatchStr(const string& sid,
1465 TSeq_id_MatchList& id_list) const
1466 {
1467 TReadLockGuard guard(m_TreeLock);
1468 // ignore '.' in the search string - cut it out
1469 SIZE_TYPE dot = sid.find('.');
1470 if ( dot != NPOS ) {
1471 string acc = sid.substr(0, dot);
1472 x_FindMatchByAcc(id_list, acc);
1473 x_FindMatchByName(id_list, acc);
1474 }
1475 else {
1476 x_FindMatchByAcc(id_list, sid);
1477 x_FindMatchByName(id_list, sid);
1478 }
1479 }
1480
1481
Match(const CSeq_id_Handle & h1,const CSeq_id_Handle & h2) const1482 bool CSeq_id_Textseq_Tree::Match(const CSeq_id_Handle& h1,
1483 const CSeq_id_Handle& h2) const
1484 {
1485 return CSeq_id_Which_Tree::Match(h1, h2);
1486 }
1487
1488
1489 inline
x_GetVersion(int & version,const CSeq_id_Handle & id) const1490 bool CSeq_id_Textseq_Tree::x_GetVersion(int& version,
1491 const CSeq_id_Handle& id) const
1492 {
1493 if ( id.IsPacked() ) {
1494 const CSeq_id_Textseq_Info* info =
1495 static_cast<const CSeq_id_Textseq_Info*>(GetInfo(id));
1496 if ( !info->IsSetVersion() ) {
1497 version = 0;
1498 return false;
1499 }
1500 version = info->GetVersion();
1501 return true;
1502 }
1503 else {
1504 CConstRef<CSeq_id> id1 = id.GetSeqId();
1505 const CTextseq_id* tid1 = id1->GetTextseq_Id();
1506 if ( !tid1->IsSetVersion() ) {
1507 version = 0;
1508 return false;
1509 }
1510 version = tid1->GetVersion();
1511 return true;
1512 }
1513 }
1514
1515
IsBetterVersion(const CSeq_id_Handle & h1,const CSeq_id_Handle & h2) const1516 bool CSeq_id_Textseq_Tree::IsBetterVersion(const CSeq_id_Handle& h1,
1517 const CSeq_id_Handle& h2) const
1518 {
1519 // Compare versions. If only one of the two ids has version,
1520 // consider it as better.
1521 int version1, version2;
1522 return x_GetVersion(version1, h1) &&
1523 (!x_GetVersion(version2, h2) || version1 > version2);
1524 }
1525
1526
HaveReverseMatch(const CSeq_id_Handle &) const1527 bool CSeq_id_Textseq_Tree::HaveReverseMatch(const CSeq_id_Handle&) const
1528 {
1529 return true;
1530 }
1531
1532
FindReverseMatch(const CSeq_id_Handle & id,TSeq_id_MatchList & id_list)1533 void CSeq_id_Textseq_Tree::FindReverseMatch(const CSeq_id_Handle& id,
1534 TSeq_id_MatchList& id_list)
1535 {
1536 bool mine = x_Check(id.Which());
1537 if ( mine ) {
1538 id_list.insert(id);
1539 }
1540 if ( id.IsPacked() ) {
1541 TReadLockGuard guard(m_TreeLock);
1542 const CSeq_id_Textseq_Info* info =
1543 static_cast<const CSeq_id_Textseq_Info*>(GetInfo(id));
1544 if ( !mine ) { // weak matching
1545 TPackedMap_CI iter = m_PackedMap.find(info->GetKey());
1546 if ( iter != m_PackedMap.end() ) {
1547 id_list.insert(CSeq_id_Handle(iter->second, id.GetPacked(), id.GetVariant()));
1548 }
1549 }
1550 if ( info->IsSetVersion() ) {
1551 TPackedKey key = info->GetKey();
1552 key.ResetVersion();
1553 TPackedMap_CI it = m_PackedMap.find(key);
1554 if ( it != m_PackedMap.end() ) {
1555 id_list.insert(CSeq_id_Handle(it->second, id.GetPacked(), id.GetVariant()));
1556 }
1557 }
1558 if ( !m_ByAcc.empty() ) {
1559 // look for non-packed variants that may have set name or revision
1560 string acc;
1561 info->RestoreAccession(acc, id.GetPacked(), id.GetVariant());
1562 x_FindRevMatchByAccNonPacked
1563 (id_list, acc, info->IsSetVersion()? &info->GetVersion(): 0);
1564 }
1565 return;
1566 }
1567
1568 CConstRef<CSeq_id> orig_id = id.GetSeqId();
1569 const CTextseq_id& orig_tid = x_Get(*orig_id);
1570
1571 if ( true || !mine ) { // this code should be enough
1572 TReadLockGuard guard(m_TreeLock);
1573 // search only existing accessions
1574 if ( orig_tid.IsSetAccession() ) {
1575 x_FindRevMatchByAcc(id_list, orig_tid.GetAccession(), &orig_tid);
1576 }
1577 if ( orig_tid.IsSetName() ) {
1578 x_FindRevMatchByName(id_list, orig_tid.GetName(), &orig_tid);
1579 }
1580 return;
1581 }
1582 }
1583
1584
Dump(CNcbiOstream & out,CSeq_id::E_Choice type,int details) const1585 size_t CSeq_id_Textseq_Tree::Dump(CNcbiOstream& out,
1586 CSeq_id::E_Choice type,
1587 int details) const
1588 {
1589 size_t total_bytes = 0;
1590 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
1591 out << "CSeq_id_Handles("<<CSeq_id::SelectionName(type)<<"): "<<endl;
1592 }
1593 {{
1594 size_t size = m_ByAcc.size() + m_ByName.size();
1595 size_t elem_size = 0, extra_size = 0;
1596 if ( size ) {
1597 elem_size = sizeof(string)+sizeof(void*); // map value
1598 elem_size += sizeof(int)+3*sizeof(void*); // red/black tree
1599 elem_size += sizeof(CSeq_id_Info); //
1600 elem_size += sizeof(CSeq_id); //
1601 elem_size += sizeof(CTextseq_id); //
1602 // malloc overhead:
1603 // map value, CSeq_id_Info, CSeq_id, CTextseq_id
1604 elem_size += 4*kMallocOverhead;
1605 ITERATE ( TStringMap, it, m_ByAcc ) {
1606 CConstRef<CSeq_id> id_id = it->second->GetSeqId();
1607 const CTextseq_id& id = *id_id->GetTextseq_Id();
1608 extra_size += sx_StringMemory(id.GetAccession());
1609 if ( id.IsSetName() ) {
1610 extra_size += sx_StringMemory(id.GetName());
1611 }
1612 if ( id.IsSetRelease() ) {
1613 extra_size += sx_StringMemory(id.GetRelease());
1614 }
1615 }
1616 }
1617 size_t bytes = extra_size + size*elem_size;
1618 total_bytes += bytes;
1619 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
1620 out << " "<<size << " handles, "<<bytes<<" bytes"<<endl;
1621 }
1622 }}
1623 {{
1624 size_t size = m_PackedMap.size(), elem_size = 0, extra_size = 0;
1625 if ( size ) {
1626 elem_size = sizeof(TPackedKey)+sizeof(void*);
1627 elem_size += sizeof(int)+3*sizeof(void*); // red/black tree
1628 elem_size += sizeof(CSeq_id_Textseq_Info); //
1629 // malloc overhead:
1630 // map value, CSeq_id_Textseq_Info
1631 elem_size += 2*kMallocOverhead;
1632 ITERATE ( TPackedMap, it, m_PackedMap ) {
1633 //extra_size += sx_StringMemory(it->first.m_Prefix);
1634 }
1635 }
1636 size_t bytes = extra_size + size*elem_size;
1637 total_bytes += bytes;
1638 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
1639 out << " " <<size << " packed handles, "<<bytes<<" bytes"<<endl;
1640 }
1641 }}
1642 if ( details >= CSeq_id_Mapper::eDumpAllIds ) {
1643 ITERATE ( TStringMap, it, m_ByAcc ) {
1644 CConstRef<CSeq_id> id = it->second->GetSeqId();
1645 out << " " << id->AsFastaString() << endl;
1646 }
1647 ITERATE ( TPackedMap, it, m_PackedMap ) {
1648 out << " packed prefix "
1649 << it->first.GetAccPrefix()<<"."<<it->first.m_Version << endl;
1650 }
1651 }
1652 return total_bytes;
1653 }
1654
1655 /////////////////////////////////////////////////////////////////////////////
1656 // CSeq_id_GB_Tree
1657 /////////////////////////////////////////////////////////////////////////////
1658
CSeq_id_GB_Tree(CSeq_id_Mapper * mapper)1659 CSeq_id_GB_Tree::CSeq_id_GB_Tree(CSeq_id_Mapper* mapper)
1660 : CSeq_id_Textseq_Tree(mapper, CSeq_id::e_not_set)
1661 {
1662 }
1663
1664
x_Check(const CSeq_id::E_Choice & type) const1665 bool CSeq_id_GB_Tree::x_Check(const CSeq_id::E_Choice& type) const
1666 {
1667 return
1668 type == CSeq_id::e_Genbank ||
1669 type == CSeq_id::e_Embl ||
1670 type == CSeq_id::e_Ddbj;
1671 }
1672
1673
1674 /////////////////////////////////////////////////////////////////////////////
1675 // CSeq_id_Pir_Tree
1676 /////////////////////////////////////////////////////////////////////////////
1677
CSeq_id_Pir_Tree(CSeq_id_Mapper * mapper)1678 CSeq_id_Pir_Tree::CSeq_id_Pir_Tree(CSeq_id_Mapper* mapper)
1679 : CSeq_id_Textseq_Tree(mapper, CSeq_id::e_Pir)
1680 {
1681 }
1682
1683
1684 /////////////////////////////////////////////////////////////////////////////
1685 // CSeq_id_Swissprot_Tree
1686 /////////////////////////////////////////////////////////////////////////////
1687
CSeq_id_Swissprot_Tree(CSeq_id_Mapper * mapper)1688 CSeq_id_Swissprot_Tree::CSeq_id_Swissprot_Tree(CSeq_id_Mapper* mapper)
1689 : CSeq_id_Textseq_Tree(mapper, CSeq_id::e_Swissprot)
1690 {
1691 }
1692
1693
1694 /////////////////////////////////////////////////////////////////////////////
1695 // CSeq_id_Prf_Tree
1696 /////////////////////////////////////////////////////////////////////////////
1697
CSeq_id_Prf_Tree(CSeq_id_Mapper * mapper)1698 CSeq_id_Prf_Tree::CSeq_id_Prf_Tree(CSeq_id_Mapper* mapper)
1699 : CSeq_id_Textseq_Tree(mapper, CSeq_id::e_Prf)
1700 {
1701 }
1702
1703
1704 /////////////////////////////////////////////////////////////////////////////
1705 // CSeq_id_Tpg_Tree
1706 /////////////////////////////////////////////////////////////////////////////
1707
CSeq_id_Tpg_Tree(CSeq_id_Mapper * mapper)1708 CSeq_id_Tpg_Tree::CSeq_id_Tpg_Tree(CSeq_id_Mapper* mapper)
1709 : CSeq_id_Textseq_Tree(mapper, CSeq_id::e_Tpg)
1710 {
1711 }
1712
1713
1714 /////////////////////////////////////////////////////////////////////////////
1715 // CSeq_id_Tpe_Tree
1716 /////////////////////////////////////////////////////////////////////////////
1717
CSeq_id_Tpe_Tree(CSeq_id_Mapper * mapper)1718 CSeq_id_Tpe_Tree::CSeq_id_Tpe_Tree(CSeq_id_Mapper* mapper)
1719 : CSeq_id_Textseq_Tree(mapper, CSeq_id::e_Tpe)
1720 {
1721 }
1722
1723
1724 /////////////////////////////////////////////////////////////////////////////
1725 // CSeq_id_Tpd_Tree
1726 /////////////////////////////////////////////////////////////////////////////
1727
CSeq_id_Tpd_Tree(CSeq_id_Mapper * mapper)1728 CSeq_id_Tpd_Tree::CSeq_id_Tpd_Tree(CSeq_id_Mapper* mapper)
1729 : CSeq_id_Textseq_Tree(mapper, CSeq_id::e_Tpd)
1730 {
1731 }
1732
1733
1734 /////////////////////////////////////////////////////////////////////////////
1735 // CSeq_id_Gpipe_Tree
1736 /////////////////////////////////////////////////////////////////////////////
1737
CSeq_id_Gpipe_Tree(CSeq_id_Mapper * mapper)1738 CSeq_id_Gpipe_Tree::CSeq_id_Gpipe_Tree(CSeq_id_Mapper* mapper)
1739 : CSeq_id_Textseq_Tree(mapper, CSeq_id::e_Gpipe)
1740 {
1741 }
1742
1743
1744 /////////////////////////////////////////////////////////////////////////////
1745 // CSeq_id_Named_annot_track_Tree
1746 /////////////////////////////////////////////////////////////////////////////
1747
CSeq_id_Named_annot_track_Tree(CSeq_id_Mapper * mapper)1748 CSeq_id_Named_annot_track_Tree::CSeq_id_Named_annot_track_Tree(CSeq_id_Mapper* mapper)
1749 : CSeq_id_Textseq_Tree(mapper, CSeq_id::e_Named_annot_track)
1750 {
1751 }
1752
1753
1754 /////////////////////////////////////////////////////////////////////////////
1755 // CSeq_id_Other_Tree
1756 /////////////////////////////////////////////////////////////////////////////
1757
CSeq_id_Other_Tree(CSeq_id_Mapper * mapper)1758 CSeq_id_Other_Tree::CSeq_id_Other_Tree(CSeq_id_Mapper* mapper)
1759 : CSeq_id_Textseq_Tree(mapper, CSeq_id::e_Other)
1760 {
1761 }
1762
1763
1764 /////////////////////////////////////////////////////////////////////////////
1765 // CSeq_id_Local_Tree
1766 /////////////////////////////////////////////////////////////////////////////
1767
1768
CSeq_id_Local_Tree(CSeq_id_Mapper * mapper)1769 CSeq_id_Local_Tree::CSeq_id_Local_Tree(CSeq_id_Mapper* mapper)
1770 : CSeq_id_Which_Tree(mapper)
1771 {
1772 }
1773
1774
~CSeq_id_Local_Tree(void)1775 CSeq_id_Local_Tree::~CSeq_id_Local_Tree(void)
1776 {
1777 }
1778
1779
Empty(void) const1780 bool CSeq_id_Local_Tree::Empty(void) const
1781 {
1782 return m_ByStr.empty() && m_ById.empty();
1783 }
1784
1785
// Return true when every character of 's' is a decimal digit according to
// isdigit().  An empty string yields true.
static inline bool sx_AllDigits(const string& s)
{
    for ( char ch : s ) {
        // isdigit() must not be fed a negative value, hence the cast.
        if ( !isdigit(static_cast<unsigned char>(ch)) ) {
            return false;
        }
    }
    return true;
}
1795
1796
sx_ParseLocalStrId(const string & str,CObject_id::TId & id)1797 static bool sx_ParseLocalStrId(const string& str, CObject_id::TId& id)
1798 {
1799 CObject_id::TId value = NStr::StringToNumeric<CObject_id::TId>(str, NStr::fConvErr_NoThrow);
1800 if ( !value ) {
1801 if ( errno ) {
1802 // not convertible to integer
1803 return false;
1804 }
1805 // converted to 0
1806 if ( str.size() != 1 ) {
1807 // leading zeroes are not allowed
1808 return false;
1809 }
1810 // valid zero as a string
1811 id = 0;
1812 return true;
1813 }
1814 else if ( value > 0 ) {
1815 // non-zero positive value
1816 if ( str[0] == '0' || str[0] == '+' ) {
1817 // redundant '+' or leading zeroes are not allowed
1818 return false;
1819 }
1820 // valid positive as a string
1821 id = value;
1822 return true;
1823 }
1824 else {
1825 // non-zero negative value
1826 if ( str[0] != '-' || str[1] == '0' ) {
1827 // leading zeroes are not allowed
1828 return false;
1829 }
1830 // valid negative as a string
1831 id = value;
1832 return true;
1833 }
1834 }
1835
1836
CSeq_id_Local_Info(const CObject_id & oid,CSeq_id_Mapper * mapper)1837 CSeq_id_Local_Info::CSeq_id_Local_Info(const CObject_id& oid, CSeq_id_Mapper* mapper)
1838 : CSeq_id_Info(CSeq_id::e_Local, mapper),
1839 m_IsId(oid.IsId())
1840 {
1841 CRef<CSeq_id> seq_id(new CSeq_id);
1842 CObject_id& oid2 = seq_id->SetLocal();
1843 if ( IsId() ) {
1844 m_HasMatchingId = true;
1845 m_MatchingId = oid.GetId();
1846 oid2.SetId(oid.GetId());
1847 }
1848 else {
1849 m_HasMatchingId = sx_ParseLocalStrId(oid.GetStr(), m_MatchingId);
1850 oid2.SetStr(oid.GetStr());
1851 }
1852 m_Seq_id = move(seq_id);
1853 }
1854
1855
~CSeq_id_Local_Info()1856 CSeq_id_Local_Info::~CSeq_id_Local_Info()
1857 {
1858 }
1859
1860
1861 inline CSeq_id_Handle::TVariant
ParseCaseVariant(const string & str) const1862 CSeq_id_Local_Info::ParseCaseVariant(const string& str) const
1863 {
1864 return s_ParseCaseVariant(m_Seq_id->GetLocal().GetStr(), str).first;
1865 }
1866
1867
1868 inline CSeq_id_Handle::TVariant
ParseCaseVariant(const CObject_id & oid) const1869 CSeq_id_Local_Info::ParseCaseVariant(const CObject_id& oid) const
1870 {
1871 if ( !oid.IsStr() ) {
1872 return 0;
1873 }
1874 return ParseCaseVariant(oid.GetStr());
1875 }
1876
1877
GetPackedSeqId(TPacked packed,TVariant variant) const1878 CConstRef<CSeq_id> CSeq_id_Local_Info::GetPackedSeqId(TPacked packed, TVariant variant) const
1879 {
1880 if ( !variant ) {
1881 return m_Seq_id;
1882 }
1883 CRef<CSeq_id> ret(new CSeq_id);
1884 const CObject_id& src = m_Seq_id->GetLocal();
1885 CObject_id& oid = ret->SetLocal();
1886 if ( IsId() ) {
1887 oid.SetId(src.GetId());
1888 }
1889 else {
1890 string& str = oid.SetStr();
1891 str = src.GetStr();
1892 s_RestoreCaseVariant(str, variant);
1893 }
1894 return ret;
1895 }
1896
1897
x_FindStrInfo(const string & str) const1898 CSeq_id_Local_Info* CSeq_id_Local_Tree::x_FindStrInfo(const string& str) const
1899 {
1900 TByStr::const_iterator it = m_ByStr.find(str);
1901 if ( it != m_ByStr.end() ) {
1902 return it->second;
1903 }
1904 // Not found
1905 return 0;
1906 }
1907
1908
x_FindIdInfo(CObject_id::TId id) const1909 CSeq_id_Local_Info* CSeq_id_Local_Tree::x_FindIdInfo(CObject_id::TId id) const
1910 {
1911 TById::const_iterator it = m_ById.find(id);
1912 if ( it != m_ById.end() ) {
1913 return it->second;
1914 }
1915 // Not found
1916 return 0;
1917 }
1918
1919
x_FindInfo(const CObject_id & oid) const1920 CSeq_id_Local_Info* CSeq_id_Local_Tree::x_FindInfo(const CObject_id& oid) const
1921 {
1922 if ( oid.IsStr() ) {
1923 return x_FindStrInfo(oid.GetStr());
1924 }
1925 else {
1926 return x_FindIdInfo(oid.GetId());
1927 }
1928 }
1929
1930
FindInfo(const CSeq_id & id) const1931 CSeq_id_Handle CSeq_id_Local_Tree::FindInfo(const CSeq_id& id) const
1932 {
1933 _ASSERT( id.IsLocal() );
1934 const CObject_id& oid = id.GetLocal();
1935 TReadLockGuard guard(m_TreeLock);
1936 CSeq_id_Local_Info* info = x_FindInfo(oid);
1937 CSeq_id_Handle::TVariant variant = info? info->ParseCaseVariant(oid): 0;
1938 return CSeq_id_Handle(info, 0, variant);
1939 }
1940
1941
FindOrCreate(const CSeq_id & id)1942 CSeq_id_Handle CSeq_id_Local_Tree::FindOrCreate(const CSeq_id& id)
1943 {
1944 const CObject_id& oid = id.GetLocal();
1945 TWriteLockGuard guard(m_TreeLock);
1946 CSeq_id_Local_Info*& info = oid.IsStr()? m_ByStr[oid.GetStr()]: m_ById[oid.GetId()];
1947 CSeq_id_Handle::TVariant variant = 0;
1948 if ( !info ) {
1949 info = new CSeq_id_Local_Info(oid, m_Mapper);
1950 }
1951 else {
1952 variant = info->ParseCaseVariant(oid);
1953 }
1954 return CSeq_id_Handle(info, 0, variant);
1955 }
1956
1957
x_Unindex(const CSeq_id_Info * info)1958 void CSeq_id_Local_Tree::x_Unindex(const CSeq_id_Info* info)
1959 {
1960 CConstRef<CSeq_id> id = info->GetSeqId();
1961 _ASSERT(id->IsLocal());
1962 const CObject_id& oid = id->GetLocal();
1963
1964 if ( oid.IsStr() ) {
1965 _VERIFY(m_ByStr.erase(oid.GetStr()));
1966 }
1967 else if ( oid.IsId() ) {
1968 _VERIFY(m_ById.erase(oid.GetId()));
1969 }
1970 }
1971
1972
HaveMatch(const CSeq_id_Handle & id) const1973 bool CSeq_id_Local_Tree::HaveMatch(const CSeq_id_Handle& id) const
1974 {
1975 // match id <-> str(number)
1976 const CSeq_id_Local_Info* sinfo =
1977 static_cast<const CSeq_id_Local_Info*>(id.x_GetInfo());
1978 return sinfo->IsId() || sinfo->HasMatchingId();
1979 }
1980
1981
FindMatch(const CSeq_id_Handle & id,TSeq_id_MatchList & id_list) const1982 void CSeq_id_Local_Tree::FindMatch(const CSeq_id_Handle& id,
1983 TSeq_id_MatchList& id_list) const
1984 {
1985 id_list.insert(id);
1986 // match id <-> str(number)
1987 const CSeq_id_Local_Info* sinfo =
1988 static_cast<const CSeq_id_Local_Info*>(id.x_GetInfo());
1989 TReadLockGuard guard(m_TreeLock);
1990 if ( sinfo->IsId() ) {
1991 // id -> str
1992 if ( CSeq_id_Info* id2 = x_FindStrInfo(NStr::NumericToString(sinfo->GetMatchingId())) ) {
1993 id_list.insert(CSeq_id_Handle(id2));
1994 }
1995 }
1996 else if ( sinfo->HasMatchingId() ) {
1997 // str -> id
1998 if ( CSeq_id_Info* id2 = x_FindIdInfo(sinfo->GetMatchingId()) ) {
1999 id_list.insert(CSeq_id_Handle(id2));
2000 }
2001 }
2002 }
2003
2004
FindMatchStr(const string & str,TSeq_id_MatchList & id_list) const2005 void CSeq_id_Local_Tree::FindMatchStr(const string& str,
2006 TSeq_id_MatchList& id_list) const
2007 {
2008 CObject_id::TId id;
2009 bool has_matching_id = sx_ParseLocalStrId(str, id);
2010 TReadLockGuard guard(m_TreeLock);
2011 // In any case search in strings
2012 if ( CSeq_id_Info* id2 = x_FindStrInfo(str) ) {
2013 id_list.insert(CSeq_id_Handle(id2));
2014 }
2015 // search possible int match
2016 if ( has_matching_id ) {
2017 if ( CSeq_id_Info* id2 = x_FindIdInfo(id) ) {
2018 id_list.insert(CSeq_id_Handle(id2));
2019 }
2020 }
2021 }
2022
2023
Dump(CNcbiOstream & out,CSeq_id::E_Choice type,int details) const2024 size_t CSeq_id_Local_Tree::Dump(CNcbiOstream& out,
2025 CSeq_id::E_Choice type,
2026 int details) const
2027 {
2028 size_t total_bytes = 0;
2029 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
2030 out << "CSeq_id_Handles("<<CSeq_id::SelectionName(type)<<"): "<<endl;
2031 }
2032 {{
2033 size_t size = m_ByStr.size(), elem_size = 0, extra_size = 0;
2034 if ( size ) {
2035 elem_size = sizeof(string)+sizeof(void*); // map value
2036 elem_size += sizeof(int)+3*sizeof(void*); // red/black tree
2037 elem_size += sizeof(CSeq_id_Info); //
2038 elem_size += sizeof(CSeq_id); //
2039 elem_size += sizeof(CObject_id); //
2040 // malloc overhead:
2041 // map value, CSeq_id_Info, CSeq_id, CObject_id
2042 elem_size += 4*kMallocOverhead;
2043 ITERATE ( TByStr, it, m_ByStr ) {
2044 extra_size += sx_StringMemory(it->first);
2045 }
2046 }
2047 size_t bytes = extra_size + size*elem_size;
2048 total_bytes += bytes;
2049 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
2050 out << " " <<size << " str handles, "<<bytes<<" bytes" << endl;
2051 }
2052 }}
2053 {{
2054 size_t size = m_ById.size(), elem_size = 0;
2055 if ( size ) {
2056 elem_size = sizeof(int)+sizeof(void*);
2057 elem_size += sizeof(int)+3*sizeof(void*); // red/black tree
2058 elem_size += sizeof(CSeq_id_Info); //
2059 elem_size += sizeof(CSeq_id); //
2060 elem_size += sizeof(CObject_id); //
2061 // malloc overhead:
2062 // map value, CSeq_id_Info, CSeq_id, CObject_id
2063 elem_size += 4*kMallocOverhead;
2064 }
2065 size_t bytes = size*elem_size;
2066 total_bytes += bytes;
2067 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
2068 out << " "<<size << " int handles, "<<bytes<<" bytes" << endl;
2069 }
2070 }}
2071 if ( details >= CSeq_id_Mapper::eDumpAllIds ) {
2072 ITERATE ( TByStr, it, m_ByStr ) {
2073 out << " " << it->second->GetSeqId()->AsFastaString() << endl;
2074 }
2075 ITERATE ( TById, it, m_ById ) {
2076 out << " " << it->second->GetSeqId()->AsFastaString() << endl;
2077 }
2078 }
2079 return total_bytes;
2080 }
2081
2082 /////////////////////////////////////////////////////////////////////////////
2083 // CSeq_id_General_Id_Info
2084 /////////////////////////////////////////////////////////////////////////////
2085
2086
CSeq_id_General_Id_Info(CSeq_id_Mapper * mapper,const TKey & key)2087 CSeq_id_General_Id_Info::CSeq_id_General_Id_Info(CSeq_id_Mapper* mapper,
2088 const TKey& key)
2089 : CSeq_id_Info(CSeq_id::e_General, mapper),
2090 m_Key(key)
2091 {
2092 }
2093
2094
~CSeq_id_General_Id_Info(void)2095 CSeq_id_General_Id_Info::~CSeq_id_General_Id_Info(void)
2096 {
2097 }
2098
2099
2100 inline
2101 CSeq_id_General_Id_Info::TPacked
Pack(const TKey &,const CDbtag & dbtag)2102 CSeq_id_General_Id_Info::Pack(const TKey& /*key*/, const CDbtag& dbtag)
2103 {
2104 TPacked id = dbtag.GetTag().GetId();
2105 if ( id <= 0 ) {
2106 --id;
2107 }
2108 return id;
2109 }
2110
2111
Restore(CDbtag & dbtag,TPacked param,TVariant variant) const2112 void CSeq_id_General_Id_Info::Restore(CDbtag& dbtag, TPacked param, TVariant variant) const
2113 {
2114 if ( !dbtag.IsSetDb() ) {
2115 dbtag.SetDb(GetDbtag());
2116 }
2117 if ( param < 0 ) {
2118 ++param;
2119 }
2120 dbtag.SetTag().SetId(CObject_id::TId(param));
2121 s_RestoreCaseVariant(dbtag.SetDb(), variant);
2122 }
2123
2124
// Produce a CSeq_id for the packed value 'param' with case variant 'variant'.
// For a non-zero variant a new CSeq_id is always allocated.  For variant 0
// the cached m_Seq_id is taken out of the member, reused only if nobody else
// references it (otherwise replaced by a new object), and stored back into
// m_Seq_id.  In both cases the object is then filled in by Restore().
CConstRef<CSeq_id> CSeq_id_General_Id_Info::GetPackedSeqId(TPacked param, TVariant variant) const
{
    CConstRef<CSeq_id> ret;
    if ( variant ) {
        // all non-initial case variants need fresh Seq-id to start with
        ret = new CSeq_id;
    }
    else {
        // otherwise try to use shared Seq-id if it's not referenced anywhere else
        typedef CSeq_id_General_Id_Info TThis;
#if defined NCBI_SLOW_ATOMIC_SWAP
        // Mutex-protected variant: take the cached object out of m_Seq_id,
        // check it, and put the result back.
        CFastMutexGuard guard(sx_GetSeqIdMutex);
        ret = m_Seq_id;
        const_cast<TThis*>(this)->m_Seq_id.Reset();
        if ( !ret || !ret->ReferencedOnlyOnce() ) {
            ret.Reset(new CSeq_id);
        }
        const_cast<TThis*>(this)->m_Seq_id = ret;
#else
        // Same sequence using the atomic release/reset operations of m_Seq_id.
        const_cast<TThis*>(this)->m_Seq_id.AtomicReleaseTo(ret);
        if ( !ret || !ret->ReferencedOnlyOnce() ) {
            ret.Reset(new CSeq_id);
        }
        const_cast<TThis*>(this)->m_Seq_id.AtomicResetFrom(ret);
#endif
    }
    // The object is held through a const reference, so constness is cast
    // away to fill in the general id.
    Restore(const_cast<CSeq_id&>(*ret).SetGeneral(), param, variant);
    return ret;
}
2154
2155
2156 /////////////////////////////////////////////////////////////////////////////
2157 // CSeq_id_General_Str_Info
2158 /////////////////////////////////////////////////////////////////////////////
2159
2160
CSeq_id_General_Str_Info(CSeq_id_Mapper * mapper,const TKey & key)2161 CSeq_id_General_Str_Info::CSeq_id_General_Str_Info(CSeq_id_Mapper* mapper,
2162 const TKey& key)
2163 : CSeq_id_Info(CSeq_id::e_General, mapper),
2164 m_Key(key)
2165 {
2166 }
2167
2168
~CSeq_id_General_Str_Info(void)2169 CSeq_id_General_Str_Info::~CSeq_id_General_Str_Info(void)
2170 {
2171 }
2172
2173
2174 inline
ParseCaseVariant(const CDbtag & dbtag) const2175 CSeq_id_Handle::TVariant CSeq_id_General_Str_Info::TKey::ParseCaseVariant(const CDbtag& dbtag) const
2176 {
2177 auto t1 = s_ParseCaseVariant(m_Db, dbtag.GetDb());
2178 const char* str = dbtag.GetTag().GetStr().data();
2179 auto t2 = s_ParseCaseVariant(m_StrPrefix, str, t1.second);
2180 auto t3 = s_ParseCaseVariant(m_StrSuffix, str+m_StrPrefix.size()+GetStrDigits(), t2.second);
2181 return t1.first | t2.first | t3.first;
2182 }
2183
2184
// Split the string tag of 'dbtag' into prefix + run of digits + suffix and
// build the corresponding key.
// On the "too many other digits" path the returned key has m_Key == 0 and
// its db/prefix/suffix members are left as default-constructed.
// Otherwise m_Db, m_StrPrefix and m_StrSuffix are filled in and m_Key holds
// a case-insensitive hash of them in the upper bits and the number of
// selected digits in the low 8 bits.
CSeq_id_General_Str_Info::TKey
CSeq_id_General_Str_Info::Parse(const CDbtag& dbtag)
{
    TKey key;
    key.m_Key = 0;
    const string& str = dbtag.GetTag().GetStr();
    size_t len = str.size(), prefix_len = len, str_digits = 0;
    // find longest digit substring
    // The string is scanned from its end towards its start; one extra
    // iteration with i == -1 feeds a 0 character so that a digit run
    // starting at position 0 is terminated as well.
    size_t cur_digits = 0, total_digits = 0;
    for ( ssize_t i = len; i >= 0; ) {
        char c = --i < 0? 0: str[i];
        if ( c >= '0' && c <= '9' ) {
            ++total_digits;
            ++cur_digits;
        }
        else {
            // End of a digit run (or no run at all).  Take it if nothing
            // has been selected yet, or if it is longer than the current
            // selection by more than two digits.
            if ( !str_digits || cur_digits > str_digits+2 ) {
                str_digits = cur_digits;
                prefix_len = i+1;
            }
            cur_digits = 0;
        }
    }
    if ( str_digits > 9 ) {
        // Keep only the last 9 digits of the run; the excess leading digits
        // become part of the prefix and are added to total_digits.
        prefix_len += str_digits - 9;
        total_digits += str_digits - 9;
        str_digits = 9;
    }
    if ( str_digits*3 < total_digits*2 ) {
        // too many other digits
        return key;
    }
    key.m_Db = dbtag.GetDb();
    if ( prefix_len > 0 ) {
        key.m_StrPrefix = str.substr(0, prefix_len);
    }
    if ( prefix_len + str_digits < str.size() ) {
        key.m_StrSuffix = str.substr(prefix_len+str_digits);
    }
    TPacked hash = 1;
    if ( 1 ) {
        // Hash over the upper-cased characters of db, prefix and suffix.
        ITERATE(string, i, key.m_Db) {
            hash = hash*17 + toupper(Uint1(*i));
        }
        ITERATE ( string, i, key.m_StrPrefix ) {
            hash = hash*17 + toupper(Uint1(*i));
        }
        ITERATE(string, i, key.m_StrSuffix) {
            hash = hash*17 + toupper(Uint1(*i));
        }
    }
    else {
        // Never executed because of 'if ( 1 )' above; it would hash only
        // the last (up to three) characters of the prefix.
        for ( size_t i = 0; i < 3 && i < prefix_len; ++i ) {
            hash = (hash << 8) | toupper(key.m_StrPrefix[prefix_len-1-i] & 0xff);
        }
    }
    // Low 8 bits: number of selected digits; the hash goes above them.
    key.m_Key = (hash << 8) | TPacked(str_digits);
    return key;
}
2244
2245
2246 inline
2247 CSeq_id_General_Str_Info::TPacked
Pack(const TKey & key,const CDbtag & dbtag)2248 CSeq_id_General_Str_Info::Pack(const TKey& key,
2249 const CDbtag& dbtag)
2250 {
2251 TPacked id = s_ParseNumber(dbtag.GetTag().GetStr(),
2252 key.m_StrPrefix.size(),
2253 key.GetStrDigits());
2254 if ( id <= 0 ) {
2255 --id;
2256 }
2257 return id;
2258 }
2259
2260
Restore(CDbtag & dbtag,TPacked param,TVariant variant) const2261 void CSeq_id_General_Str_Info::Restore(CDbtag& dbtag, TPacked param, TVariant variant) const
2262 {
2263 if ( !dbtag.IsSetDb() ) {
2264 dbtag.SetDb(GetDbtag());
2265 }
2266 CObject_id& obj_id = dbtag.SetTag();
2267 if ( !obj_id.IsStr() ) {
2268 obj_id.SetStr(GetStrPrefix());
2269 string& str = obj_id.SetStr();
2270 str.resize(str.size() + GetStrDigits(), '0');
2271 if ( !GetStrSuffix().empty() ) {
2272 str += GetStrSuffix();
2273 }
2274 }
2275 if ( param < 0 ) {
2276 ++param;
2277 }
2278 s_RestoreNumber(obj_id.SetStr(), GetStrPrefix().size(), GetStrDigits(), param);
2279 variant = s_RestoreCaseVariant(dbtag.SetDb(), variant);
2280 s_RestoreCaseVariant(obj_id.SetStr(), variant);
2281 }
2282
2283
// Produce a CSeq_id for the packed value 'param' with case variant 'variant'.
// For a non-zero variant a new CSeq_id is always allocated.  For variant 0
// the cached m_Seq_id is taken out of the member, reused only if nobody else
// references it (otherwise replaced by a new object), and stored back into
// m_Seq_id.  In both cases the object is then filled in by Restore().
CConstRef<CSeq_id> CSeq_id_General_Str_Info::GetPackedSeqId(TPacked param, TVariant variant) const
{
    CConstRef<CSeq_id> ret;
    if ( variant ) {
        // all non-initial case variants need fresh Seq-id to start with
        ret = new CSeq_id;
    }
    else {
        // otherwise try to use shared Seq-id if it's not referenced anywhere else
        typedef CSeq_id_General_Str_Info TThis;
#if defined NCBI_SLOW_ATOMIC_SWAP
        // Mutex-protected variant: take the cached object out of m_Seq_id,
        // check it, and put the result back.
        CFastMutexGuard guard(sx_GetSeqIdMutex);
        ret = m_Seq_id;
        const_cast<TThis*>(this)->m_Seq_id.Reset();
        if ( !ret || !ret->ReferencedOnlyOnce() ) {
            ret.Reset(new CSeq_id);
        }
        const_cast<TThis*>(this)->m_Seq_id = ret;
#else
        // Same sequence using the atomic release/reset operations of m_Seq_id.
        const_cast<TThis*>(this)->m_Seq_id.AtomicReleaseTo(ret);
        if ( !ret || !ret->ReferencedOnlyOnce() ) {
            ret.Reset(new CSeq_id);
        }
        const_cast<TThis*>(this)->m_Seq_id.AtomicResetFrom(ret);
#endif
    }
    // The object is held through a const reference, so constness is cast
    // away to fill in the general id.
    Restore(const_cast<CSeq_id&>(*ret).SetGeneral(), param, variant);
    return ret;
}
2313
2314
2315 /////////////////////////////////////////////////////////////////////////////
2316 // CSeq_id_General_PlainInfo
2317 /////////////////////////////////////////////////////////////////////////////
2318
2319
CSeq_id_General_PlainInfo(const CDbtag & dbid,CSeq_id_Mapper * mapper)2320 CSeq_id_General_PlainInfo::CSeq_id_General_PlainInfo(const CDbtag& dbid, CSeq_id_Mapper* mapper)
2321 : CSeq_id_Info(CSeq_id::e_General, mapper)
2322 {
2323 CRef<CSeq_id> seq_id(new CSeq_id);
2324 s_AssignDbtag(seq_id->SetGeneral(), dbid);
2325 m_Seq_id = move(seq_id);
2326 }
2327
2328
2329 inline
ParseCaseVariant(const CDbtag & dbtag) const2330 CSeq_id_Handle::TVariant CSeq_id_General_PlainInfo::ParseCaseVariant(const CDbtag& dbtag) const
2331 {
2332 const CDbtag& src = m_Seq_id->GetGeneral();
2333 if ( dbtag.GetTag().IsId() ) {
2334 return s_ParseCaseVariant(src.GetDb(), dbtag.GetDb()).first;
2335 }
2336 else {
2337 auto t1 = s_ParseCaseVariant(src.GetDb(), dbtag.GetDb());
2338 auto t2 = s_ParseCaseVariant(src.GetTag().GetStr(), dbtag.GetTag().GetStr(), t1.second);
2339 return t1.first | t2.first;
2340 }
2341 }
2342
2343
GetPackedSeqId(TPacked packed,TVariant variant) const2344 CConstRef<CSeq_id> CSeq_id_General_PlainInfo::GetPackedSeqId(TPacked packed, TVariant variant) const
2345 {
2346 if ( !variant ) {
2347 return m_Seq_id;
2348 }
2349 CRef<CSeq_id> id(new CSeq_id);
2350 CDbtag& dbtag = id->SetGeneral();
2351 s_AssignDbtag(dbtag, m_Seq_id->GetGeneral());
2352 if ( dbtag.GetTag().IsId() ) {
2353 s_RestoreCaseVariant(dbtag.SetDb(), variant);
2354 }
2355 else {
2356 variant = s_RestoreCaseVariant(dbtag.SetDb(), variant);
2357 s_RestoreCaseVariant(dbtag.SetTag().SetStr(), variant);
2358 }
2359 return id;
2360 }
2361
2362 /////////////////////////////////////////////////////////////////////////////
2363 // CSeq_id_General_Tree
2364 /////////////////////////////////////////////////////////////////////////////
2365
2366
CSeq_id_General_Tree(CSeq_id_Mapper * mapper)2367 CSeq_id_General_Tree::CSeq_id_General_Tree(CSeq_id_Mapper* mapper)
2368 : CSeq_id_Which_Tree(mapper)
2369 {
2370 }
2371
2372
~CSeq_id_General_Tree(void)2373 CSeq_id_General_Tree::~CSeq_id_General_Tree(void)
2374 {
2375 }
2376
2377
Empty(void) const2378 bool CSeq_id_General_Tree::Empty(void) const
2379 {
2380 return m_DbMap.empty() && m_PackedIdMap.empty() && m_PackedStrMap.empty();
2381 }
2382
2383
x_FindInfo(const CDbtag & dbid) const2384 CSeq_id_General_PlainInfo* CSeq_id_General_Tree::x_FindInfo(const CDbtag& dbid) const
2385 {
2386 TDbMap::const_iterator db = m_DbMap.find(dbid.GetDb());
2387 if (db == m_DbMap.end())
2388 return 0;
2389 const STagMap& tm = db->second;
2390 const CObject_id& oid = dbid.GetTag();
2391 if ( oid.IsStr() ) {
2392 STagMap::TByStr::const_iterator it = tm.m_ByStr.find(oid.GetStr());
2393 if (it != tm.m_ByStr.end()) {
2394 return it->second;
2395 }
2396 }
2397 else if ( oid.IsId() ) {
2398 STagMap::TById::const_iterator it = tm.m_ById.find(oid.GetId());
2399 if (it != tm.m_ById.end()) {
2400 return it->second;
2401 }
2402 }
2403 // Not found
2404 return 0;
2405 }
2406
2407
// Minimum number of digits CSeq_id_General_Str_Info::Parse() must select
// in a string tag for the id to be stored in packed form; string tags with
// fewer digits go to the plain index (see FindInfo() and FindOrCreate()).
static const size_t kMinGeneralStrDigits = 3;
2409
2410
FindInfo(const CSeq_id & id) const2411 CSeq_id_Handle CSeq_id_General_Tree::FindInfo(const CSeq_id& id) const
2412 {
2413 _ASSERT( id.IsGeneral() );
2414 const CDbtag& dbid = id.GetGeneral();
2415 if ( s_PackGeneralEnabled() ) {
2416 switch ( dbid.GetTag().Which() ) {
2417 case CObject_id::e_Str:
2418 {
2419 TPackedStrKey key = CSeq_id_General_Str_Info::Parse(dbid);
2420 if ( key.GetStrDigits() < kMinGeneralStrDigits ) {
2421 break;
2422 }
2423 TPacked packed = CSeq_id_General_Str_Info::Pack(key, dbid);
2424 TReadLockGuard guard(m_TreeLock);
2425 TPackedStrMap::const_iterator it = m_PackedStrMap.find(key);
2426 if ( it != m_PackedStrMap.end() ) {
2427 return CSeq_id_Handle(it->second, packed, it->first.ParseCaseVariant(dbid));
2428 }
2429 return null;
2430 }
2431 case CObject_id::e_Id:
2432 {
2433 const string& key = dbid.GetDb();
2434 TPacked packed = CSeq_id_General_Id_Info::Pack(key, dbid);
2435 TReadLockGuard guard(m_TreeLock);
2436 TPackedIdMap::const_iterator it = m_PackedIdMap.find(key);
2437 if ( it != m_PackedIdMap.end() ) {
2438 return CSeq_id_Handle(it->second, packed, s_ParseCaseVariant(it->first, dbid.GetDb()).first);
2439 }
2440 return null;
2441 }
2442 default:
2443 return null;
2444 }
2445 }
2446 TReadLockGuard guard(m_TreeLock);
2447 CSeq_id_General_PlainInfo* info = x_FindInfo(dbid);
2448 CSeq_id_Handle::TVariant variant = info? info->ParseCaseVariant(dbid): 0;
2449 return CSeq_id_Handle(info, 0, variant);
2450 }
2451
2452
FindOrCreate(const CSeq_id & id)2453 CSeq_id_Handle CSeq_id_General_Tree::FindOrCreate(const CSeq_id& id)
2454 {
2455 _ASSERT( id.IsGeneral() );
2456 const CDbtag& dbid = id.GetGeneral();
2457 if ( s_PackGeneralEnabled() ) {
2458 switch ( dbid.GetTag().Which() ) {
2459 case CObject_id::e_Str:
2460 {
2461 TPackedStrKey key = CSeq_id_General_Str_Info::Parse(dbid);
2462 if ( key.GetStrDigits() < kMinGeneralStrDigits ) {
2463 break;
2464 }
2465 TPacked packed = CSeq_id_General_Str_Info::Pack(key, dbid);
2466 TWriteLockGuard guard(m_TreeLock);
2467 TPackedStrMap::iterator it = m_PackedStrMap.find(key);
2468 if ( it == m_PackedStrMap.end() ) {
2469 CConstRef<CSeq_id_General_Str_Info> info
2470 (new CSeq_id_General_Str_Info(m_Mapper, key));
2471 m_PackedStrMap.insert(TPackedStrMap::value_type(key, info));
2472 // newly created ids have case variant bits all zeros
2473 return CSeq_id_Handle(info, packed, 0);
2474 }
2475 else {
2476 // determine case variant
2477 CSeq_id_Handle::TVariant variant = it->first.ParseCaseVariant(dbid);
2478 return CSeq_id_Handle(it->second, packed, variant);
2479 }
2480 }
2481 case CObject_id::e_Id:
2482 {
2483 const string& key = dbid.GetDb();
2484 TPacked packed = CSeq_id_General_Id_Info::Pack(key, dbid);
2485 TWriteLockGuard guard(m_TreeLock);
2486 TPackedIdMap::iterator it = m_PackedIdMap.lower_bound(key);
2487 CSeq_id_Handle::TVariant variant = 0;
2488 if ( it == m_PackedIdMap.end() ||
2489 !NStr::EqualNocase(it->first, key) ) {
2490 CConstRef<CSeq_id_General_Id_Info> info
2491 (new CSeq_id_General_Id_Info(m_Mapper, key));
2492 it = m_PackedIdMap.insert
2493 (it, TPackedIdMap::value_type(key, info));
2494 }
2495 else {
2496 variant = s_ParseCaseVariant(it->first, dbid.GetDb()).first;
2497 }
2498 return CSeq_id_Handle(it->second, packed, variant);
2499 }
2500 default:
2501 break;
2502 }
2503 }
2504 TWriteLockGuard guard(m_TreeLock);
2505 CSeq_id_General_PlainInfo* info = x_FindInfo(dbid);
2506 CSeq_id_Handle::TVariant variant = 0;
2507 if ( !info ) {
2508 info = new CSeq_id_General_PlainInfo(dbid, m_Mapper);
2509 STagMap& tm = m_DbMap[dbid.GetDb()];
2510 const CObject_id& oid = dbid.GetTag();
2511 if ( oid.IsStr() ) {
2512 //LOG_POST("CSeq_id_General_Tree::CreateStr("<<oid.GetStr()<<")");
2513 _VERIFY(tm.m_ByStr.insert
2514 (STagMap::TByStr::value_type(oid.GetStr(), info)).second);
2515 }
2516 else if ( oid.IsId() ) {
2517 //LOG_POST("CSeq_id_General_Tree::CreateStr("<<oid.GetId()<<")");
2518 _VERIFY(tm.m_ById.insert(STagMap::TById::value_type(oid.GetId(),
2519 info)).second);
2520 }
2521 else {
2522 NCBI_THROW(CSeq_id_MapperException, eEmptyError,
2523 "Can not create index for an empty db-tag");
2524 }
2525 }
2526 else {
2527 variant = info->ParseCaseVariant(dbid);
2528 }
2529 return CSeq_id_Handle(info, 0, variant);
2530 }
2531
2532
x_Unindex(const CSeq_id_Info * info)2533 void CSeq_id_General_Tree::x_Unindex(const CSeq_id_Info* info)
2534 {
2535 if ( !m_PackedStrMap.empty() ) {
2536 const CSeq_id_General_Str_Info* sinfo =
2537 dynamic_cast<const CSeq_id_General_Str_Info*>(info);
2538 if ( sinfo ) {
2539 m_PackedStrMap.erase(sinfo->GetKey());
2540 return;
2541 }
2542 }
2543 if ( !m_PackedIdMap.empty() ) {
2544 const CSeq_id_General_Id_Info* sinfo =
2545 dynamic_cast<const CSeq_id_General_Id_Info*>(info);
2546 if ( sinfo ) {
2547 m_PackedIdMap.erase(sinfo->GetKey());
2548 return;
2549 }
2550 }
2551
2552 CConstRef<CSeq_id> id = info->GetSeqId();
2553 _ASSERT( id->IsGeneral() );
2554 const CDbtag& dbid = id->GetGeneral();
2555
2556 TDbMap::iterator db_it = m_DbMap.find(dbid.GetDb());
2557 _ASSERT(db_it != m_DbMap.end());
2558 STagMap& tm = db_it->second;
2559 const CObject_id& oid = dbid.GetTag();
2560 if ( oid.IsStr() ) {
2561 _VERIFY(tm.m_ByStr.erase(oid.GetStr()));
2562 }
2563 else if ( oid.IsId() ) {
2564 _VERIFY(tm.m_ById.erase(oid.GetId()));
2565 }
2566 if (tm.m_ByStr.empty() && tm.m_ById.empty())
2567 m_DbMap.erase(db_it);
2568 }
2569
2570
HaveMatch(const CSeq_id_Handle & id) const2571 bool CSeq_id_General_Tree::HaveMatch(const CSeq_id_Handle& id) const
2572 {
2573 // match id <-> str(number)
2574 if ( !m_PackedStrMap.empty() ) {
2575 const CSeq_id_General_Str_Info* sinfo =
2576 dynamic_cast<const CSeq_id_General_Str_Info*>(id.x_GetInfo());
2577 if ( sinfo ) {
2578 // string with non-digital prefix or suffix
2579 // cannot be converted to numeric id
2580 if ( !sinfo->GetStrSuffix().empty() ||
2581 !sx_AllDigits(sinfo->GetStrPrefix()) ) {
2582 return false;
2583 }
2584 }
2585 }
2586 return true;
2587 }
2588
2589
FindMatch(const CSeq_id_Handle & id,TSeq_id_MatchList & id_list) const2590 void CSeq_id_General_Tree::FindMatch(const CSeq_id_Handle& id,
2591 TSeq_id_MatchList& id_list) const
2592 {
2593 id_list.insert(id);
2594 // match id <-> str(number)
2595 if ( !m_PackedStrMap.empty() ) {
2596 const CSeq_id_General_Str_Info* sinfo =
2597 dynamic_cast<const CSeq_id_General_Str_Info*>(id.x_GetInfo());
2598 if ( sinfo ) {
2599 // string with non-digital prefix or suffix
2600 // cannot be converted to numeric id
2601 if ( !sinfo->GetStrSuffix().empty() ||
2602 !sx_AllDigits(sinfo->GetStrPrefix()) ) {
2603 return;
2604 }
2605 }
2606 }
2607 CConstRef<CSeq_id> seq_id = id.GetSeqId();
2608 const CDbtag& dbtag = seq_id->GetGeneral();
2609 const CObject_id& obj_id = dbtag.GetTag();
2610 if ( obj_id.IsId() ) {
2611 int n = obj_id.GetId();
2612 if ( n >= 0 ) {
2613 CSeq_id seq_id2;
2614 CDbtag& dbtag2 = seq_id2.SetGeneral();
2615 dbtag2.SetDb(dbtag.GetDb());
2616 dbtag2.SetTag().SetStr(NStr::IntToString(n));
2617 CSeq_id_Handle id2 = FindInfo(seq_id2);
2618 if ( id2 ) {
2619 id_list.insert(id2);
2620 }
2621 }
2622 }
2623 else {
2624 const string& s = obj_id.GetStr();
2625 int n = NStr::StringToNonNegativeInt(s);
2626 if ( n >= 0 && NStr::IntToString(n) == s ) {
2627 CSeq_id seq_id2;
2628 CDbtag& dbtag2 = seq_id2.SetGeneral();
2629 dbtag2.SetDb(dbtag.GetDb());
2630 dbtag2.SetTag().SetId(n);
2631 CSeq_id_Handle id2 = FindInfo(seq_id2);
2632 if ( id2 ) {
2633 id_list.insert(id2);
2634 }
2635 }
2636 }
2637 }
2638
2639
FindMatchStr(const string & sid,TSeq_id_MatchList & id_list) const2640 void CSeq_id_General_Tree::FindMatchStr(const string& sid,
2641 TSeq_id_MatchList& id_list) const
2642 {
2643 TPacked value;
2644 bool ok;
2645 try {
2646 value = NStr::StringToNumeric<TPacked>(sid);
2647 ok = true;
2648 }
2649 catch (const CStringException&) {
2650 // Not an integer value
2651 value = -1;
2652 ok = false;
2653 }
2654 TReadLockGuard guard(m_TreeLock);
2655 ITERATE(TDbMap, db_it, m_DbMap) {
2656 // In any case search in strings
2657 STagMap::TByStr::const_iterator str_it =
2658 db_it->second.m_ByStr.find(sid);
2659 if (str_it != db_it->second.m_ByStr.end()) {
2660 id_list.insert(CSeq_id_Handle(str_it->second));
2661 }
2662 if ( ok ) {
2663 STagMap::TById::const_iterator int_it =
2664 db_it->second.m_ById.find(value);
2665 if (int_it != db_it->second.m_ById.end()) {
2666 id_list.insert(CSeq_id_Handle(int_it->second));
2667 }
2668 }
2669 }
2670 }
2671
2672
Dump(CNcbiOstream & out,CSeq_id::E_Choice type,int details) const2673 size_t CSeq_id_General_Tree::Dump(CNcbiOstream& out,
2674 CSeq_id::E_Choice type,
2675 int details) const
2676 {
2677 size_t total_bytes = 0;
2678 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
2679 out << "CSeq_id_Handles("<<CSeq_id::SelectionName(type)<<"): "<<endl;
2680 }
2681 {{ // m_DbMap
2682 size_t count = 0, bytes = 0;
2683 ITERATE ( TDbMap, it, m_DbMap ) {
2684 bytes += sizeof(string)+sizeof(STagMap); // map value
2685 bytes += sizeof(int)+3*sizeof(void*); // red/black tree
2686 // malloc overhead:
2687 // map value
2688 bytes += 1*kMallocOverhead;
2689 bytes += sx_StringMemory(it->first);
2690 ITERATE ( STagMap::TById, it2, it->second.m_ById ) {
2691 count += 1;
2692 bytes += sizeof(it2->first)+sizeof(it2->second); // map
2693 bytes += sizeof(int)+3*sizeof(void*); // red/black tree
2694 bytes += sizeof(CSeq_id_Info);
2695 bytes += sizeof(CSeq_id);
2696 bytes += sizeof(CObject_id);
2697 bytes += 4*kMallocOverhead;
2698 }
2699 ITERATE ( STagMap::TByStr, it2, it->second.m_ByStr ) {
2700 count += 1;
2701 bytes += sizeof(it2->first)+sizeof(it2->second); // map
2702 bytes += sizeof(int)+3*sizeof(void*); // red/black tree
2703 bytes += sizeof(CSeq_id_Info);
2704 bytes += sizeof(CSeq_id);
2705 bytes += sizeof(CObject_id);
2706 bytes += 4*kMallocOverhead;
2707 bytes += sx_StringMemory(it2->first);
2708 }
2709 }
2710 total_bytes += bytes;
2711 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
2712 out << " "<<count << " handles, "<<bytes<<" bytes" << endl;
2713 }
2714 }}
2715 {{ // m_PackedIdMap
2716 size_t count = m_PackedIdMap.size(), elem_size = 0, extra_size = 0;
2717 if ( count ) {
2718 elem_size = sizeof(TPackedIdKey)+sizeof(void*);
2719 elem_size += sizeof(int)+3*sizeof(void*); // red/black tree
2720 elem_size += sizeof(CSeq_id_General_Id_Info); //
2721 // malloc overhead:
2722 // map value, CSeq_id_General_Id_Info
2723 elem_size += 2*kMallocOverhead;
2724 ITERATE ( TPackedIdMap, it, m_PackedIdMap ) {
2725 extra_size += sx_StringMemory(it->first);
2726 }
2727 }
2728 size_t bytes = extra_size + count*elem_size;
2729 total_bytes += bytes;
2730 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
2731 out << " "<<count << " packed int handles, "<<bytes<<" bytes" << endl;
2732 }
2733 }}
2734 {{ // m_PackedStrMap
2735 size_t count = m_PackedStrMap.size(), elem_size = 0, extra_size = 0;
2736 if ( count ) {
2737 elem_size = sizeof(TPackedIdKey)+sizeof(void*);
2738 elem_size += sizeof(int)+3*sizeof(void*); // red/black tree
2739 elem_size += sizeof(CSeq_id_General_Str_Info); //
2740 // malloc overhead:
2741 // map value, CSeq_id_General_Id_Info
2742 elem_size += 2*kMallocOverhead;
2743 ITERATE ( TPackedStrMap, it, m_PackedStrMap ) {
2744 extra_size += sx_StringMemory(it->first.m_Db);
2745 extra_size += sx_StringMemory(it->first.m_StrPrefix);
2746 extra_size += sx_StringMemory(it->first.m_StrSuffix);
2747 }
2748 }
2749 size_t bytes = extra_size + count*elem_size;
2750 total_bytes += bytes;
2751 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
2752 out << " "<<count << " packed str handles, "<<bytes<<" bytes" << endl;
2753 }
2754 }}
2755 if ( details >= CSeq_id_Mapper::eDumpAllIds ) {
2756 ITERATE ( TDbMap, it, m_DbMap ) {
2757 ITERATE ( STagMap::TByStr, it2, it->second.m_ByStr ) {
2758 out << " "<<it2->second->GetSeqId()->AsFastaString() << endl;
2759 }
2760 ITERATE ( STagMap::TByStr, it2, it->second.m_ByStr ) {
2761 out << " "<<it2->second->GetSeqId()->AsFastaString() << endl;
2762 }
2763 }
2764 ITERATE ( TPackedIdMap, it, m_PackedIdMap ) {
2765 out << " packed int "<<it->first << endl;
2766 }
2767 ITERATE ( TPackedStrMap, it, m_PackedStrMap ) {
2768 out << " packed str "<<it->first.m_Key<<"/"<<it->first.m_Db<<"/"
2769 <<it->first.m_StrPrefix<<"/"<<it->first.m_StrSuffix << endl;
2770 }
2771 }
2772 return total_bytes;
2773 }
2774
2775 /////////////////////////////////////////////////////////////////////////////
2776 // CSeq_id_Giim_Tree
2777 /////////////////////////////////////////////////////////////////////////////
2778
2779
CSeq_id_Giim_Tree(CSeq_id_Mapper * mapper)2780 CSeq_id_Giim_Tree::CSeq_id_Giim_Tree(CSeq_id_Mapper* mapper)
2781 : CSeq_id_Which_Tree(mapper)
2782 {
2783 }
2784
2785
~CSeq_id_Giim_Tree(void)2786 CSeq_id_Giim_Tree::~CSeq_id_Giim_Tree(void)
2787 {
2788 }
2789
2790
Empty(void) const2791 bool CSeq_id_Giim_Tree::Empty(void) const
2792 {
2793 return m_IdMap.empty();
2794 }
2795
2796
x_FindInfo(const CGiimport_id & gid) const2797 CSeq_id_Info* CSeq_id_Giim_Tree::x_FindInfo(const CGiimport_id& gid) const
2798 {
2799 TIdMap::const_iterator id_it = m_IdMap.find(gid.GetId());
2800 if (id_it == m_IdMap.end())
2801 return 0;
2802 ITERATE (TGiimList, dbr_it, id_it->second) {
2803 CConstRef<CSeq_id> id = (*dbr_it)->GetSeqId();
2804 const CGiimport_id& gid2 = id->GetGiim();
2805 // Both Db and Release must be equal
2806 if ( !gid.Equals(gid2) ) {
2807 return *dbr_it;
2808 }
2809 }
2810 // Not found
2811 return 0;
2812 }
2813
2814
FindInfo(const CSeq_id & id) const2815 CSeq_id_Handle CSeq_id_Giim_Tree::FindInfo(const CSeq_id& id) const
2816 {
2817 _ASSERT( id.IsGiim() );
2818 const CGiimport_id& gid = id.GetGiim();
2819 TReadLockGuard guard(m_TreeLock);
2820 return CSeq_id_Handle(x_FindInfo(gid));
2821 }
2822
2823
FindOrCreate(const CSeq_id & id)2824 CSeq_id_Handle CSeq_id_Giim_Tree::FindOrCreate(const CSeq_id& id)
2825 {
2826 _ASSERT( id.IsGiim() );
2827 const CGiimport_id& gid = id.GetGiim();
2828 TWriteLockGuard guard(m_TreeLock);
2829 CSeq_id_Info* info = x_FindInfo(gid);
2830 if ( !info ) {
2831 info = CreateInfo(id);
2832 m_IdMap[gid.GetId()].push_back(info);
2833 }
2834 return CSeq_id_Handle(info);
2835 }
2836
2837
x_Unindex(const CSeq_id_Info * info)2838 void CSeq_id_Giim_Tree::x_Unindex(const CSeq_id_Info* info)
2839 {
2840 CConstRef<CSeq_id> id = info->GetSeqId();
2841 _ASSERT( id->IsGiim() );
2842 const CGiimport_id& gid = id->GetGiim();
2843
2844 TIdMap::iterator id_it = m_IdMap.find(gid.GetId());
2845 _ASSERT(id_it != m_IdMap.end());
2846 TGiimList& giims = id_it->second;
2847 NON_CONST_ITERATE(TGiimList, dbr_it, giims) {
2848 if (*dbr_it == info) {
2849 giims.erase(dbr_it);
2850 break;
2851 }
2852 }
2853 if ( giims.empty() )
2854 m_IdMap.erase(id_it);
2855 }
2856
2857
FindMatchStr(const string & sid,TSeq_id_MatchList & id_list) const2858 void CSeq_id_Giim_Tree::FindMatchStr(const string& sid,
2859 TSeq_id_MatchList& id_list) const
2860 {
2861 TReadLockGuard guard(m_TreeLock);
2862 try {
2863 TPacked value = NStr::StringToNumeric<TPacked>(sid);
2864 TIdMap::const_iterator it = m_IdMap.find(value);
2865 if (it == m_IdMap.end())
2866 return;
2867 ITERATE(TGiimList, git, it->second) {
2868 id_list.insert(CSeq_id_Handle(*git));
2869 }
2870 }
2871 catch (CStringException) {
2872 // Not an integer value
2873 return;
2874 }
2875 }
2876
2877
Dump(CNcbiOstream & out,CSeq_id::E_Choice type,int details) const2878 size_t CSeq_id_Giim_Tree::Dump(CNcbiOstream& out,
2879 CSeq_id::E_Choice type,
2880 int details) const
2881 {
2882 size_t total_bytes = 0;
2883 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
2884 out << "CSeq_id_Handles("<<CSeq_id::SelectionName(type)<<"): ";
2885 }
2886 size_t count = 0, bytes = 0;
2887 ITERATE ( TIdMap, it, m_IdMap ) {
2888 bytes += sizeof(it->first) + sizeof(it->second);
2889 bytes += sizeof(int)+3*sizeof(void*); // red/black tree
2890 // malloc overhead:
2891 // map value, vector
2892 bytes += 2*kMallocOverhead;
2893 size_t size2 = it->second.size();
2894 count += size2;
2895 bytes += it->second.capacity()*sizeof(void*);
2896 bytes += size2*sizeof(CSeq_id_Info);
2897 bytes += size2*sizeof(CSeq_id);
2898 bytes += size2*sizeof(CGiimport_id);
2899 ITERATE ( TGiimList, it2, it->second ) {
2900 const CGiimport_id& id = (*it2)->GetSeqId()->GetGiim();
2901 if ( id.IsSetDb() ) {
2902 bytes += sx_StringMemory(id.GetDb());
2903 }
2904 if ( id.IsSetRelease() ) {
2905 bytes += sx_StringMemory(id.GetRelease());
2906 }
2907 }
2908 }
2909 total_bytes += bytes;
2910 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
2911 out << count << " handles, "<<bytes<<" bytes" << endl;
2912 }
2913 if ( details >= CSeq_id_Mapper::eDumpAllIds ) {
2914 ITERATE ( TIdMap, it, m_IdMap ) {
2915 ITERATE ( TGiimList, it2, it->second ) {
2916 out << " "<<(*it2)->GetSeqId()->AsFastaString() << endl;
2917 }
2918 }
2919 }
2920 return total_bytes;
2921 }
2922
2923 /////////////////////////////////////////////////////////////////////////////
2924 // CSeq_id_Patent_Tree
2925 /////////////////////////////////////////////////////////////////////////////
2926
2927
CSeq_id_Patent_Tree(CSeq_id_Mapper * mapper)2928 CSeq_id_Patent_Tree::CSeq_id_Patent_Tree(CSeq_id_Mapper* mapper)
2929 : CSeq_id_Which_Tree(mapper)
2930 {
2931 }
2932
2933
~CSeq_id_Patent_Tree(void)2934 CSeq_id_Patent_Tree::~CSeq_id_Patent_Tree(void)
2935 {
2936 }
2937
2938
Empty(void) const2939 bool CSeq_id_Patent_Tree::Empty(void) const
2940 {
2941 return m_CountryMap.empty();
2942 }
2943
2944
x_FindInfo(const CPatent_seq_id & pid) const2945 CSeq_id_Info* CSeq_id_Patent_Tree::x_FindInfo(const CPatent_seq_id& pid) const
2946 {
2947 const CId_pat& cit = pid.GetCit();
2948 TByCountry::const_iterator cntry_it = m_CountryMap.find(cit.GetCountry());
2949 if (cntry_it == m_CountryMap.end())
2950 return 0;
2951
2952 const string* number;
2953 const SPat_idMap::TByNumber* by_number;
2954 if ( cit.GetId().IsNumber() ) {
2955 number = &cit.GetId().GetNumber();
2956 by_number = &cntry_it->second.m_ByNumber;
2957 }
2958 else if ( cit.GetId().IsApp_number() ) {
2959 number = &cit.GetId().GetApp_number();
2960 by_number = &cntry_it->second.m_ByApp_number;
2961 }
2962 else {
2963 return 0;
2964 }
2965
2966 SPat_idMap::TByNumber::const_iterator num_it = by_number->find(*number);
2967 if (num_it == by_number->end())
2968 return 0;
2969 SPat_idMap::TBySeqid::const_iterator seqid_it =
2970 num_it->second.find(pid.GetSeqid());
2971 if (seqid_it != num_it->second.end()) {
2972 return seqid_it->second;
2973 }
2974 // Not found
2975 return 0;
2976 }
2977
2978
FindInfo(const CSeq_id & id) const2979 CSeq_id_Handle CSeq_id_Patent_Tree::FindInfo(const CSeq_id& id) const
2980 {
2981 _ASSERT( id.IsPatent() );
2982 const CPatent_seq_id& pid = id.GetPatent();
2983 TReadLockGuard guard(m_TreeLock);
2984 return CSeq_id_Handle(x_FindInfo(pid));
2985 }
2986
FindOrCreate(const CSeq_id & id)2987 CSeq_id_Handle CSeq_id_Patent_Tree::FindOrCreate(const CSeq_id& id)
2988 {
2989 _ASSERT( id.IsPatent() );
2990 const CPatent_seq_id& pid = id.GetPatent();
2991 TWriteLockGuard guard(m_TreeLock);
2992 CSeq_id_Info* info = x_FindInfo(pid);
2993 if ( !info ) {
2994 const CId_pat& cit = pid.GetCit();
2995 SPat_idMap& country = m_CountryMap[cit.GetCountry()];
2996 if ( cit.GetId().IsNumber() ) {
2997 SPat_idMap::TBySeqid& num =
2998 country.m_ByNumber[cit.GetId().GetNumber()];
2999 _ASSERT(num.find(pid.GetSeqid()) == num.end());
3000 info = CreateInfo(id);
3001 num[pid.GetSeqid()] = info;
3002 }
3003 else if ( cit.GetId().IsApp_number() ) {
3004 SPat_idMap::TBySeqid& app = country.m_ByApp_number[
3005 cit.GetId().GetApp_number()];
3006 _ASSERT(app.find(pid.GetSeqid()) == app.end());
3007 info = CreateInfo(id);
3008 app[pid.GetSeqid()] = info;
3009 }
3010 else {
3011 // Can not index empty patent number
3012 NCBI_THROW(CSeq_id_MapperException, eEmptyError,
3013 "Cannot index empty patent number");
3014 }
3015 }
3016 return CSeq_id_Handle(info);
3017 }
3018
3019
x_Unindex(const CSeq_id_Info * info)3020 void CSeq_id_Patent_Tree::x_Unindex(const CSeq_id_Info* info)
3021 {
3022 CConstRef<CSeq_id> id = info->GetSeqId();
3023 _ASSERT( id->IsPatent() );
3024 const CPatent_seq_id& pid = id->GetPatent();
3025
3026 TByCountry::iterator country_it =
3027 m_CountryMap.find(pid.GetCit().GetCountry());
3028 _ASSERT(country_it != m_CountryMap.end());
3029 SPat_idMap& pats = country_it->second;
3030 if ( pid.GetCit().GetId().IsNumber() ) {
3031 SPat_idMap::TByNumber::iterator num_it =
3032 pats.m_ByNumber.find(pid.GetCit().GetId().GetNumber());
3033 _ASSERT(num_it != pats.m_ByNumber.end());
3034 SPat_idMap::TBySeqid::iterator seqid_it =
3035 num_it->second.find(pid.GetSeqid());
3036 _ASSERT(seqid_it != num_it->second.end());
3037 _ASSERT(seqid_it->second == info);
3038 num_it->second.erase(seqid_it);
3039 if ( num_it->second.empty() )
3040 pats.m_ByNumber.erase(num_it);
3041 }
3042 else if ( pid.GetCit().GetId().IsApp_number() ) {
3043 SPat_idMap::TByNumber::iterator app_it =
3044 pats.m_ByApp_number.find(pid.GetCit().GetId().GetApp_number());
3045 _ASSERT( app_it != pats.m_ByApp_number.end() );
3046 SPat_idMap::TBySeqid::iterator seqid_it =
3047 app_it->second.find(pid.GetSeqid());
3048 _ASSERT(seqid_it != app_it->second.end());
3049 _ASSERT(seqid_it->second == info);
3050 app_it->second.erase(seqid_it);
3051 if ( app_it->second.empty() )
3052 pats.m_ByApp_number.erase(app_it);
3053 }
3054 if (country_it->second.m_ByNumber.empty() &&
3055 country_it->second.m_ByApp_number.empty())
3056 m_CountryMap.erase(country_it);
3057 }
3058
3059
FindMatchStr(const string & sid,TSeq_id_MatchList & id_list) const3060 void CSeq_id_Patent_Tree::FindMatchStr(const string& sid,
3061 TSeq_id_MatchList& id_list) const
3062 {
3063 TReadLockGuard guard(m_TreeLock);
3064 ITERATE (TByCountry, cit, m_CountryMap) {
3065 SPat_idMap::TByNumber::const_iterator nit =
3066 cit->second.m_ByNumber.find(sid);
3067 if (nit != cit->second.m_ByNumber.end()) {
3068 ITERATE(SPat_idMap::TBySeqid, iit, nit->second) {
3069 id_list.insert(CSeq_id_Handle(iit->second));
3070 }
3071 }
3072 SPat_idMap::TByNumber::const_iterator ait =
3073 cit->second.m_ByApp_number.find(sid);
3074 if (ait != cit->second.m_ByApp_number.end()) {
3075 ITERATE(SPat_idMap::TBySeqid, iit, nit->second) {
3076 id_list.insert(CSeq_id_Handle(iit->second));
3077 }
3078 }
3079 }
3080 }
3081
3082
Dump(CNcbiOstream & out,CSeq_id::E_Choice type,int details) const3083 size_t CSeq_id_Patent_Tree::Dump(CNcbiOstream& out,
3084 CSeq_id::E_Choice type,
3085 int details) const
3086 {
3087 size_t total_bytes = 0;
3088 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
3089 out << "CSeq_id_Handles("<<CSeq_id::SelectionName(type)<<"): ";
3090 }
3091 size_t count = 0, bytes = 0;
3092 ITERATE ( TByCountry, it, m_CountryMap ) {
3093 bytes += sizeof(it->first) + sizeof(it->second);
3094 bytes += sizeof(int)+3*sizeof(void*); // red/black tree
3095 // malloc overhead:
3096 // map value, vector
3097 bytes += 1*kMallocOverhead;
3098 bytes += sx_StringMemory(it->first);
3099 ITERATE ( SPat_idMap::TByNumber, it2, it->second.m_ByNumber ) {
3100 bytes += sizeof(it2->first) + sizeof(it2->second);
3101 bytes += sizeof(int)+3*sizeof(void*); // red/black tree
3102 // malloc overhead:
3103 // map value, vector
3104 bytes += 1*kMallocOverhead;
3105 bytes += sx_StringMemory(it2->first);
3106 ITERATE ( SPat_idMap::TBySeqid, it3, it2->second ) {
3107 count += 1;
3108 bytes += sizeof(it2->first) + sizeof(it2->second);
3109 bytes += sizeof(int)+3*sizeof(void*); // red/black tree
3110 bytes += sizeof(CSeq_id_Info);
3111 bytes += sizeof(CSeq_id);
3112 bytes += sizeof(CPatent_seq_id);
3113 bytes += sizeof(CId_pat);
3114 // malloc overhead:
3115 // map value,
3116 bytes += 5*kMallocOverhead;
3117 }
3118 }
3119 }
3120 total_bytes += bytes;
3121 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
3122 out << count << " handles, "<<bytes<<" bytes" << endl;
3123 }
3124 if ( details >= CSeq_id_Mapper::eDumpAllIds ) {
3125 ITERATE ( TByCountry, it, m_CountryMap ) {
3126 ITERATE ( SPat_idMap::TByNumber, it2, it->second.m_ByNumber ) {
3127 ITERATE ( SPat_idMap::TBySeqid, it3, it2->second ) {
3128 out << " "<<it3->second->GetSeqId()->AsFastaString() << endl;
3129 }
3130 }
3131 }
3132 }
3133 return total_bytes;
3134 }
3135
3136 /////////////////////////////////////////////////////////////////////////////
3137 // CSeq_id_PDB_Tree
3138 /////////////////////////////////////////////////////////////////////////////
3139
3140
CSeq_id_PDB_Tree(CSeq_id_Mapper * mapper)3141 CSeq_id_PDB_Tree::CSeq_id_PDB_Tree(CSeq_id_Mapper* mapper)
3142 : CSeq_id_Which_Tree(mapper)
3143 {
3144 }
3145
3146
~CSeq_id_PDB_Tree(void)3147 CSeq_id_PDB_Tree::~CSeq_id_PDB_Tree(void)
3148 {
3149 }
3150
3151
Empty(void) const3152 bool CSeq_id_PDB_Tree::Empty(void) const
3153 {
3154 return m_MolMap.empty();
3155 }
3156
3157
x_IdToStrKey(const CPDB_seq_id & id) const3158 inline string CSeq_id_PDB_Tree::x_IdToStrKey(const CPDB_seq_id& id) const
3159 {
3160 // this is an attempt to follow the undocumented rules of PDB
3161 // ("documented" as code written elsewhere)
3162 string skey = id.GetMol().Get();
3163 if (id.IsSetChain_id()) {
3164 skey += '_';
3165 skey += id.GetChain_id();
3166 }
3167 else if (id.IsSetChain()) {
3168 skey += '_';
3169 skey += char(id.GetChain());
3170 }
3171 return skey;
3172 }
3173
3174
FindInfo(const CSeq_id & id) const3175 CSeq_id_Handle CSeq_id_PDB_Tree::FindInfo(const CSeq_id& id) const
3176 {
3177 _ASSERT( id.IsPdb() );
3178 const CPDB_seq_id& pid = id.GetPdb();
3179 TReadLockGuard guard(m_TreeLock);
3180 TMolMap::const_iterator mol_it = m_MolMap.find(x_IdToStrKey(pid));
3181 if ( mol_it != m_MolMap.end() ) {
3182 ITERATE( TSubMolList, it, mol_it->second ) {
3183 if ( pid.Equals((*it)->GetSeqId()->GetPdb()) ) {
3184 return CSeq_id_Handle(*it);
3185 }
3186 }
3187 }
3188 return CSeq_id_Handle();
3189 }
3190
3191
FindOrCreate(const CSeq_id & id)3192 CSeq_id_Handle CSeq_id_PDB_Tree::FindOrCreate(const CSeq_id& id)
3193 {
3194 _ASSERT( id.IsPdb() );
3195 const CPDB_seq_id& pid = id.GetPdb();
3196 TWriteLockGuard guard(m_TreeLock);
3197 TSubMolList& sub = m_MolMap[x_IdToStrKey(id.GetPdb())];
3198 ITERATE ( TSubMolList, it, sub ) {
3199 if ( pid.Equals((*it)->GetSeqId()->GetPdb()) ) {
3200 return CSeq_id_Handle(*it);
3201 }
3202 }
3203 CSeq_id_Info* info = CreateInfo(id);
3204 sub.push_back(info);
3205 return CSeq_id_Handle(info);
3206 }
3207
3208
x_Unindex(const CSeq_id_Info * info)3209 void CSeq_id_PDB_Tree::x_Unindex(const CSeq_id_Info* info)
3210 {
3211 CConstRef<CSeq_id> id = info->GetSeqId();
3212 _ASSERT( id->IsPdb() );
3213 const CPDB_seq_id& pid = id->GetPdb();
3214
3215 TMolMap::iterator mol_it = m_MolMap.find(x_IdToStrKey(pid));
3216 _ASSERT(mol_it != m_MolMap.end());
3217 NON_CONST_ITERATE(TSubMolList, it, mol_it->second) {
3218 if (*it == info) {
3219 _ASSERT(pid.Equals((*it)->GetSeqId()->GetPdb()));
3220 mol_it->second.erase(it);
3221 break;
3222 }
3223 }
3224 if ( mol_it->second.empty() )
3225 m_MolMap.erase(mol_it);
3226 }
3227
3228
HaveMatch(const CSeq_id_Handle &) const3229 bool CSeq_id_PDB_Tree::HaveMatch(const CSeq_id_Handle& ) const
3230 {
3231 return true;
3232 }
3233
3234
FindMatch(const CSeq_id_Handle & id,TSeq_id_MatchList & id_list) const3235 void CSeq_id_PDB_Tree::FindMatch(const CSeq_id_Handle& id,
3236 TSeq_id_MatchList& id_list) const
3237 {
3238 //_ASSERT(id && id == FindInfo(id.GetSeqId()));
3239 CConstRef<CSeq_id> seq_id = id.GetSeqId();
3240 const CPDB_seq_id& pid = seq_id->GetPdb();
3241 TReadLockGuard guard(m_TreeLock);
3242 TMolMap::const_iterator mol_it = m_MolMap.find(x_IdToStrKey(pid));
3243 if (mol_it == m_MolMap.end())
3244 return;
3245 ITERATE(TSubMolList, it, mol_it->second) {
3246 const CPDB_seq_id& pid2 = (*it)->GetSeqId()->GetPdb();
3247 // Ignore date if not set in id
3248 if ( pid.IsSetRel() ) {
3249 if ( !pid2.IsSetRel() ||
3250 !pid.GetRel().Equals(pid2.GetRel()) )
3251 continue;
3252 }
3253 id_list.insert(CSeq_id_Handle(*it));
3254 }
3255 }
3256
3257
FindMatchStr(const string & sid,TSeq_id_MatchList & id_list) const3258 void CSeq_id_PDB_Tree::FindMatchStr(const string& sid,
3259 TSeq_id_MatchList& id_list) const
3260 {
3261 TReadLockGuard guard(m_TreeLock);
3262 TMolMap::const_iterator mit = m_MolMap.find(sid);
3263 if (mit == m_MolMap.end())
3264 return;
3265 ITERATE(TSubMolList, sub_it, mit->second) {
3266 id_list.insert(CSeq_id_Handle(*sub_it));
3267 }
3268 }
3269
3270
HaveReverseMatch(const CSeq_id_Handle &) const3271 bool CSeq_id_PDB_Tree::HaveReverseMatch(const CSeq_id_Handle& ) const
3272 {
3273 return true;
3274 }
3275
3276
FindReverseMatch(const CSeq_id_Handle & id,TSeq_id_MatchList & id_list)3277 void CSeq_id_PDB_Tree::FindReverseMatch(const CSeq_id_Handle& id,
3278 TSeq_id_MatchList& id_list)
3279 {
3280 //_ASSERT(id && id == FindInfo(id.GetSeqId()));
3281 id_list.insert(id);
3282 CConstRef<CSeq_id> seq_id = id.GetSeqId();
3283 const CPDB_seq_id& pid = seq_id->GetPdb();
3284 if ( !pid.IsSetRel() )
3285 return;
3286 // find id without release date
3287 TReadLockGuard guard(m_TreeLock);
3288 TMolMap::const_iterator mol_it = m_MolMap.find(x_IdToStrKey(pid));
3289 if (mol_it == m_MolMap.end())
3290 return;
3291 ITERATE(TSubMolList, it, mol_it->second) {
3292 const CPDB_seq_id& pid2 = (*it)->GetSeqId()->GetPdb();
3293 // Ignore date if set in id
3294 if ( pid2.IsSetRel() )
3295 continue;
3296 id_list.insert(CSeq_id_Handle(*it));
3297 }
3298 }
3299
3300
Dump(CNcbiOstream & out,CSeq_id::E_Choice type,int details) const3301 size_t CSeq_id_PDB_Tree::Dump(CNcbiOstream& out,
3302 CSeq_id::E_Choice type,
3303 int details) const
3304 {
3305 size_t total_bytes = 0;
3306 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
3307 out << "CSeq_id_Handles("<<CSeq_id::SelectionName(type)<<"): ";
3308 }
3309 size_t count = 0, bytes = 0;
3310 ITERATE ( TMolMap, it, m_MolMap ) {
3311 bytes += sizeof(it->first) + sizeof(it->second);
3312 bytes += sizeof(int)+3*sizeof(void*); // red/black tree
3313 // malloc overhead:
3314 // map value, vector
3315 bytes += 2*kMallocOverhead;
3316 bytes += sx_StringMemory(it->first);
3317 size_t size2 = it->second.size();
3318 count += size2;
3319 bytes += it->second.capacity()*sizeof(void*);
3320 bytes += size2*sizeof(CSeq_id_Info);
3321 bytes += size2*sizeof(CSeq_id);
3322 bytes += size2*sizeof(CPDB_seq_id);
3323 ITERATE ( TSubMolList, it2, it->second ) {
3324 const CPDB_seq_id& id = (*it2)->GetSeqId()->GetPdb();
3325 if ( id.IsSetRel() ) {
3326 bytes += sizeof(CDate);
3327 bytes += kMallocOverhead;
3328 }
3329 }
3330 }
3331 total_bytes += bytes;
3332 if ( details >= CSeq_id_Mapper::eDumpStatistics ) {
3333 out << count << " handles, "<<bytes<<" bytes" << endl;
3334 }
3335 if ( details >= CSeq_id_Mapper::eDumpAllIds ) {
3336 ITERATE ( TMolMap, it, m_MolMap ) {
3337 ITERATE ( TSubMolList, it2, it->second ) {
3338 out << " "<<(*it2)->GetSeqId()->AsFastaString() << endl;
3339 }
3340 }
3341 }
3342 return total_bytes;
3343 }
3344
3345
GetErrCodeString(void) const3346 const char* CSeq_id_MapperException::GetErrCodeString(void) const
3347 {
3348 switch ( GetErrCode() ) {
3349 case eTypeError: return "eTypeError";
3350 case eSymbolError: return "eSymbolError";
3351 case eEmptyError: return "eEmptyError";
3352 case eOtherError: return "eOtherError";
3353 default: return CException::GetErrCodeString();
3354 }
3355 }
3356
3357
3358 END_SCOPE(objects)
3359 END_NCBI_SCOPE
3360