1 #ifndef OBJECTS_OBJMGR___SEQ_ID_TREE__HPP 2 #define OBJECTS_OBJMGR___SEQ_ID_TREE__HPP 3 4 /* $Id: seq_id_tree.hpp 629051 2021-04-09 11:36:07Z ivanov $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Aleksey Grichenko, Eugene Vasilchenko 30 * 31 * File Description: 32 * Seq-id mapper for Object Manager 33 * 34 */ 35 36 #include <corelib/ncbiobj.hpp> 37 #include <corelib/ncbimtx.hpp> 38 #include <corelib/ncbistr.hpp> 39 #include <corelib/ncbi_limits.hpp> 40 41 #include <objects/general/Date.hpp> 42 #include <objects/general/Dbtag.hpp> 43 #include <objects/general/Object_id.hpp> 44 45 #include <objects/biblio/Id_pat.hpp> 46 47 #include <objects/seqloc/Seq_id.hpp> 48 #include <objects/seqloc/PDB_mol_id.hpp> 49 #include <objects/seqloc/PDB_seq_id.hpp> 50 #include <objects/seqloc/Patent_seq_id.hpp> 51 #include <objects/seqloc/Giimport_id.hpp> 52 #include <objects/seqloc/Textseq_id.hpp> 53 54 #include <objects/seq/seq_id_handle.hpp> 55 56 #include <vector> 57 #include <set> 58 #include <map> 59 #include <unordered_map> 60 61 BEGIN_NCBI_SCOPE 62 BEGIN_SCOPE(objects) 63 64 65 class CSeq_id; 66 class CSeq_id_Handle; 67 class CSeq_id_Info; 68 class CSeq_id_Mapper; 69 class CSeq_id_Which_Tree; 70 71 struct PHashNocase { get_hashPHashNocase72 static char get_hash(char c) 73 { 74 // In ids only ASCII characters are allowed, and in ASCII 75 // upper and lower cases differ only by one bit. 76 // So for efficiency it's enough to reset that bit 77 // instead of using more complex tolower(). 78 return c&~32; 79 //return tolower(c); 80 } operator ()PHashNocase81 size_t operator()(const string& s) const 82 { 83 size_t h = s.size(); 84 for ( auto c : s ) 85 h = h*17 + get_hash(c); 86 return h; 87 } 88 }; 89 struct PEqualNocase { operator ()PEqualNocase90 bool operator()(const string& s1, const string& s2) const 91 { 92 // in most cases letter cases match, 93 // so it's faster first to check that 94 // with more efficient direct string comparison 95 if ( s1 == s2 ) { 96 return true; 97 } 98 // otherwise we first check if lengths are the same 99 size_t len = s1.size(); 100 if ( s2.size() != len ) { 101 return false; 102 } 103 for ( size_t i = 0; i < len; ++i ) { 104 char c1 = s1[i]; 105 char c2 = s2[i]; 106 if ( tolower((unsigned char)c1) != tolower((unsigned char)c2) ) { 107 return false; 108 } 109 } 110 return true; 111 // commented out old less efficient comparison 112 //return NStr::EqualNocase(s1, s2); 113 } 114 }; 115 116 //////////////////////////////////////////////////////////////////// 117 // 118 // CSeq_id_***_Tree:: 119 // 120 // Seq-id sub-type specific trees 121 // 122 123 124 // Base class for seq-id type-specific trees 125 class CSeq_id_Which_Tree : public CObject 126 { 127 public: 128 // 'ctors 129 CSeq_id_Which_Tree(CSeq_id_Mapper* mapper); 130 virtual ~CSeq_id_Which_Tree(void); 131 132 static void Initialize(CSeq_id_Mapper* mapper, 133 vector<CRef<CSeq_id_Which_Tree> >& v); 134 135 virtual bool Empty(void) const = 0; 136 137 // Find exaclty the same seq-id 138 virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const = 0; 139 virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id) = 0; 140 virtual CSeq_id_Handle GetGiHandle(TGi gi); 141 142 virtual void DropInfo(const CSeq_id_Info* info); 143 144 typedef set<CSeq_id_Handle> TSeq_id_MatchList; 145 146 // Get the list of matching seq-id. 147 virtual bool HaveMatch(const CSeq_id_Handle& id) const; 148 virtual void FindMatch(const CSeq_id_Handle& id, 149 TSeq_id_MatchList& id_list) const; 150 virtual void FindMatchStr(const string& sid, 151 TSeq_id_MatchList& id_list) const = 0; 152 153 // returns true if FindMatch(h1, id_list) will put h2 in id_list. 154 virtual bool Match(const CSeq_id_Handle& h1, 155 const CSeq_id_Handle& h2) const; 156 157 virtual bool IsBetterVersion(const CSeq_id_Handle& h1, 158 const CSeq_id_Handle& h2) const; 159 160 // Reverse matching 161 virtual bool HaveReverseMatch(const CSeq_id_Handle& id) const; 162 virtual void FindReverseMatch(const CSeq_id_Handle& id, 163 TSeq_id_MatchList& id_list); 164 165 virtual size_t Dump(CNcbiOstream& out, 166 CSeq_id::E_Choice type, 167 int details) const = 0; 168 169 protected: 170 friend class CSeq_id_Mapper; 171 172 typedef CSeq_id_Info::TPacked TPacked; 173 174 CSeq_id_Info* CreateInfo(CSeq_id::E_Choice type); 175 CSeq_id_Info* CreateInfo(const CSeq_id& id); 176 GetInfo(const CSeq_id_Handle & id)177 static const CSeq_id_Info* GetInfo(const CSeq_id_Handle& id) 178 { 179 return id.m_Info; 180 } GetSeqId(const CSeq_id_Info * info)181 static const CSeq_id* GetSeqId(const CSeq_id_Info* info) 182 { 183 return info->m_Seq_id.GetPointerOrNull(); 184 } 185 virtual void x_Unindex(const CSeq_id_Info* info) = 0; 186 187 typedef CFastMutex TTreeLock; 188 typedef TTreeLock::TReadLockGuard TReadLockGuard; 189 typedef TTreeLock::TWriteLockGuard TWriteLockGuard; 190 191 mutable TTreeLock m_TreeLock; 192 CSeq_id_Mapper* m_Mapper; 193 194 private: 195 CSeq_id_Which_Tree(const CSeq_id_Which_Tree& tree); 196 const CSeq_id_Which_Tree& operator=(const CSeq_id_Which_Tree& tree); 197 }; 198 199 200 201 //////////////////////////////////////////////////////////////////// 202 // not-set tree (maximum 1 entry allowed) 203 204 205 class CSeq_id_not_set_Tree : public CSeq_id_Which_Tree 206 { 207 public: 208 CSeq_id_not_set_Tree(CSeq_id_Mapper* mapper); 209 ~CSeq_id_not_set_Tree(void); 210 211 virtual bool Empty(void) const; 212 213 virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const; 214 virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id); 215 216 virtual void DropInfo(const CSeq_id_Info* info); 217 218 virtual void FindMatch(const CSeq_id_Handle& id, 219 TSeq_id_MatchList& id_list) const; 220 virtual void FindMatchStr(const string& sid, 221 TSeq_id_MatchList& id_list) const; 222 virtual void FindReverseMatch(const CSeq_id_Handle& id, 223 TSeq_id_MatchList& id_list); 224 225 virtual size_t Dump(CNcbiOstream& out, 226 CSeq_id::E_Choice type, 227 int details) const; 228 229 protected: 230 virtual void x_Unindex(const CSeq_id_Info* info); 231 bool x_Check(const CSeq_id& id) const; 232 }; 233 234 235 //////////////////////////////////////////////////////////////////// 236 // Base class for Gi, Gibbsq & Gibbmt trees 237 238 239 class CSeq_id_int_Tree : public CSeq_id_Which_Tree 240 { 241 public: 242 CSeq_id_int_Tree(CSeq_id_Mapper* mapper); 243 ~CSeq_id_int_Tree(void); 244 245 virtual bool Empty(void) const; 246 247 virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const; 248 virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id); 249 250 virtual void FindMatchStr(const string& sid, 251 TSeq_id_MatchList& id_list) const; 252 253 virtual size_t Dump(CNcbiOstream& out, 254 CSeq_id::E_Choice type, 255 int details) const; 256 257 protected: 258 virtual void x_Unindex(const CSeq_id_Info* info); 259 virtual bool x_Check(const CSeq_id& id) const = 0; 260 virtual TPacked x_Get(const CSeq_id& id) const = 0; 261 262 private: 263 typedef map<TPacked, CSeq_id_Info*> TIntMap; 264 TIntMap m_IntMap; 265 }; 266 267 268 //////////////////////////////////////////////////////////////////// 269 // Gibbsq tree 270 271 272 class CSeq_id_Gibbsq_Tree : public CSeq_id_int_Tree 273 { 274 public: 275 CSeq_id_Gibbsq_Tree(CSeq_id_Mapper* mapper); 276 protected: 277 virtual bool x_Check(const CSeq_id& id) const; 278 virtual TPacked x_Get(const CSeq_id& id) const; 279 }; 280 281 282 //////////////////////////////////////////////////////////////////// 283 // Gibbmt tree 284 285 286 class CSeq_id_Gibbmt_Tree : public CSeq_id_int_Tree 287 { 288 public: 289 CSeq_id_Gibbmt_Tree(CSeq_id_Mapper* mapper); 290 protected: 291 virtual bool x_Check(const CSeq_id& id) const; 292 virtual TPacked x_Get(const CSeq_id& id) const; 293 }; 294 295 296 //////////////////////////////////////////////////////////////////// 297 // Gi tree 298 299 300 class CSeq_id_Gi_Info : public CSeq_id_Info 301 { 302 public: 303 CSeq_id_Gi_Info(CSeq_id_Mapper* mapper); 304 305 virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant /*variant*/) const; 306 }; 307 308 309 class CSeq_id_Gi_Tree : public CSeq_id_Which_Tree 310 { 311 public: 312 CSeq_id_Gi_Tree(CSeq_id_Mapper* mapper); 313 ~CSeq_id_Gi_Tree(void); 314 315 virtual bool Empty(void) const; 316 317 virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const; 318 virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id); 319 virtual CSeq_id_Handle GetGiHandle(TGi gi); 320 321 virtual void FindMatchStr(const string& sid, 322 TSeq_id_MatchList& id_list) const; 323 324 virtual size_t Dump(CNcbiOstream& out, 325 CSeq_id::E_Choice type, 326 int details) const; 327 328 protected: 329 virtual void x_Unindex(const CSeq_id_Info* info); 330 bool x_Check(const CSeq_id& id) const; 331 TGi x_Get(const CSeq_id& id) const; 332 333 CSeq_id_Info* m_ZeroInfo; 334 CSeq_id_Info* m_SharedInfo; 335 }; 336 337 338 //////////////////////////////////////////////////////////////////// 339 // Base class for e_Genbank, e_Embl, e_Pir, e_Swissprot, e_Other, 340 // e_Ddbj, e_Prf, e_Tpg, e_Tpe, e_Tpd trees 341 342 343 class CSeq_id_Textseq_PlainInfo : public CSeq_id_Info 344 { 345 public: 346 CSeq_id_Textseq_PlainInfo(const CConstRef<CSeq_id>& seq_id, CSeq_id_Mapper* mapper); 347 348 virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const; 349 350 TVariant ParseCaseVariant(const string& acc) const; 351 TVariant ParseCaseVariant(const CTextseq_id& id) const; 352 }; 353 354 355 class CSeq_id_Textseq_Info : public CSeq_id_Info { 356 public: 357 typedef CTextseq_id::TVersion TVersion; 358 359 struct TKey { TKeyCSeq_id_Textseq_Info::TKey360 TKey(void) 361 : m_Hash(0), m_Version(0) 362 { 363 } 364 365 unsigned m_Hash; 366 TVersion m_Version; 367 Uint1 m_PrefixLen; 368 enum { 369 kMaxPrefixLen = 7 370 }; 371 char m_PrefixBuf[kMaxPrefixLen]; 372 373 DECLARE_OPERATOR_BOOL(m_Hash != 0); 374 operator ==CSeq_id_Textseq_Info::TKey375 bool operator==(const TKey& b) const { 376 return m_Hash == b.m_Hash && m_Version == b.m_Version && 377 NStr::EqualNocase(GetAccPrefix(), b.GetAccPrefix()); 378 } operator !=CSeq_id_Textseq_Info::TKey379 bool operator!=(const TKey& b) const { 380 return !(*this == b); 381 } operator <CSeq_id_Textseq_Info::TKey382 bool operator<(const TKey& b) const { 383 return m_Hash < b.m_Hash || 384 (m_Hash == b.m_Hash && 385 (m_Version < b.m_Version || 386 (m_Version == b.m_Version && 387 NStr::CompareNocase(GetAccPrefix(), b.GetAccPrefix()) < 0))); 388 } 389 SameHashCSeq_id_Textseq_Info::TKey390 bool SameHash(const TKey& b) const { 391 return m_Hash == b.m_Hash; 392 } SameHashNoVerCSeq_id_Textseq_Info::TKey393 bool SameHashNoVer(const TKey& b) const { 394 return ((m_Hash ^ b.m_Hash) & ~1) == 0; 395 } EqualAccCSeq_id_Textseq_Info::TKey396 bool EqualAcc(const TKey& b) const { 397 return SameHashNoVer(b) && 398 NStr::EqualNocase(GetAccPrefix(), b.GetAccPrefix()); 399 } 400 IsSetVersionCSeq_id_Textseq_Info::TKey401 bool IsSetVersion(void) const { 402 return (m_Hash & 1) != 0; 403 } GetVersionCSeq_id_Textseq_Info::TKey404 const TVersion& GetVersion(void) const { 405 _ASSERT(IsSetVersion()); 406 return m_Version; 407 } ResetVersionCSeq_id_Textseq_Info::TKey408 void ResetVersion(void) { 409 m_Hash &= ~1; 410 m_Version = 0; 411 } SetVersionCSeq_id_Textseq_Info::TKey412 void SetVersion(TVersion version) { 413 m_Hash |= 1; 414 m_Version = version; 415 } GetAccDigitsCSeq_id_Textseq_Info::TKey416 int GetAccDigits(void) const { 417 return (m_Hash & 0xff) >> 1; 418 } 419 TVariant ParseCaseVariant(const string& acc) const; 420 GetPrefixLenCSeq_id_Textseq_Info::TKey421 size_t GetPrefixLen() const { 422 return m_PrefixLen; 423 } GetAccPrefixCSeq_id_Textseq_Info::TKey424 CTempString GetAccPrefix(void) const { 425 return CTempString(m_PrefixBuf, m_PrefixLen); 426 } 427 }; 428 CSeq_id_Textseq_Info(CSeq_id::E_Choice type, 429 CSeq_id_Mapper* mapper, 430 const TKey& key); 431 ~CSeq_id_Textseq_Info(void); 432 GetKey(void) const433 const TKey& GetKey(void) const { 434 return m_Key; 435 } GetAccPrefix(void) const436 CTempString GetAccPrefix(void) const { 437 return m_Key.GetAccPrefix(); 438 } GoodPrefix(const CTempString & acc) const439 bool GoodPrefix(const CTempString& acc) const { 440 return NStr::StartsWith(acc, GetAccPrefix(), NStr::eNocase); 441 } GetAccDigits(void) const442 int GetAccDigits(void) const { 443 return m_Key.GetAccDigits(); 444 } IsSetVersion(void) const445 bool IsSetVersion(void) const { 446 return m_Key.IsSetVersion(); 447 } GetVersion(void) const448 const TVersion& GetVersion(void) const { 449 return m_Key.GetVersion(); 450 } 451 void RestoreAccession(string& acc, TPacked param, TVariant variant) const; 452 void Restore(CTextseq_id& id, TPacked param, TVariant variant) const; 453 454 static TKey ParseAcc(const string& acc, const TVersion* ver); ParseAcc(const string & acc,const CTextseq_id & tid)455 static TKey ParseAcc(const string& acc, const CTextseq_id& tid) { 456 TVersion ver; 457 const TVersion *ver_ptr = 0; 458 if ( tid.IsSetVersion() ) { 459 ver = tid.GetVersion(); 460 ver_ptr = &ver; 461 } 462 return ParseAcc(acc, ver_ptr); 463 } 464 static TPacked Pack(const TKey& key, const string& acc); 465 static TPacked Pack(const TKey& key, const CTextseq_id& id); 466 static TVariant ParseCaseVariant(const CSeq_id_Info* info, const string& acc); 467 468 virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const; 469 470 private: 471 TKey m_Key; 472 }; 473 474 475 class CSeq_id_Textseq_Tree : public CSeq_id_Which_Tree 476 { 477 public: 478 typedef CTextseq_id::TVersion TVersion; 479 480 CSeq_id_Textseq_Tree(CSeq_id_Mapper* mapper, CSeq_id::E_Choice type); 481 ~CSeq_id_Textseq_Tree(void); 482 483 virtual bool Empty(void) const; 484 485 virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const; 486 virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id); 487 488 virtual bool HaveMatch(const CSeq_id_Handle& id) const; 489 virtual void FindMatch(const CSeq_id_Handle& id, 490 TSeq_id_MatchList& id_list) const; 491 virtual void FindMatchStr(const string& sid, 492 TSeq_id_MatchList& id_list) const; 493 494 virtual bool Match(const CSeq_id_Handle& h1, 495 const CSeq_id_Handle& h2) const; 496 virtual bool IsBetterVersion(const CSeq_id_Handle& h1, 497 const CSeq_id_Handle& h2) const; 498 499 virtual bool HaveReverseMatch(const CSeq_id_Handle& id) const; 500 virtual void FindReverseMatch(const CSeq_id_Handle& id, 501 TSeq_id_MatchList& id_list); 502 503 virtual size_t Dump(CNcbiOstream& out, 504 CSeq_id::E_Choice type, 505 int details) const; 506 507 protected: 508 virtual void x_Unindex(const CSeq_id_Info* info); 509 virtual bool x_Check(const CSeq_id::E_Choice& type) const; 510 virtual bool x_Check(const CSeq_id& id) const; x_Get(const CSeq_id & id) const511 const CTextseq_id& x_Get(const CSeq_id& id) const { 512 const CTextseq_id* text_id = id.GetTextseq_Id(); 513 _ASSERT(text_id); 514 return *text_id; 515 } 516 CSeq_id_Textseq_PlainInfo* x_FindStrInfo(CSeq_id::E_Choice type, 517 const CTextseq_id& tid) const; 518 bool x_GetVersion(TVersion& version, const CSeq_id_Handle& id) const; 519 520 private: 521 typedef multimap<string, CSeq_id_Textseq_PlainInfo*, PNocase> TStringMap; 522 typedef TStringMap::value_type TStringMapValue; 523 typedef TStringMap::const_iterator TStringMapCI; 524 typedef pair<TStringMapCI, TStringMapCI> TVersions; 525 typedef CSeq_id_Textseq_Info::TKey TPackedKey; 526 typedef map<TPackedKey, CConstRef<CSeq_id_Textseq_Info> > TPackedMap; 527 typedef TPackedMap::value_type TPackedMapValue; 528 typedef TPackedMap::iterator TPackedMap_I; 529 typedef TPackedMap::const_iterator TPackedMap_CI; 530 531 static bool x_Equals(const CTextseq_id& id1, const CTextseq_id& id2); 532 static void x_Erase(TStringMap& str_map, 533 const string& key, 534 const CSeq_id_Info* info); 535 536 CSeq_id_Textseq_PlainInfo* x_FindStrInfo(const TStringMap& str_map, 537 const string& str, 538 CSeq_id::E_Choice type, 539 const CTextseq_id& tid) const; 540 541 void x_FindMatchByAcc(TSeq_id_MatchList& id_list, 542 const string& acc, 543 const TVersion* ver = 0) const; x_FindMatchByAcc(TSeq_id_MatchList & id_list,const string & acc,const CTextseq_id * tid) const544 void x_FindMatchByAcc(TSeq_id_MatchList& id_list, 545 const string& acc, 546 const CTextseq_id* tid) const { 547 TVersion ver, *ver_ptr = 0; 548 if ( tid && tid->IsSetVersion() ) { 549 ver = tid->GetVersion(); 550 ver_ptr = &ver; 551 } 552 x_FindMatchByAcc(id_list, acc, ver_ptr); 553 } 554 void x_FindMatchByName(TSeq_id_MatchList& id_list, 555 const string& name, 556 const CTextseq_id* tid = 0) const; 557 558 void x_FindRevMatchByAccPacked(TSeq_id_MatchList& id_list, 559 const string& acc, 560 const TVersion* ver = 0) const; 561 void x_FindRevMatchByAccNonPacked(TSeq_id_MatchList& id_list, 562 const string& acc, 563 const TVersion* ver = 0) const; 564 void x_FindRevMatchByAcc(TSeq_id_MatchList& id_list, 565 const string& acc, 566 const TVersion* ver = 0) const; x_FindRevMatchByAcc(TSeq_id_MatchList & id_list,const string & acc,const CTextseq_id * tid) const567 void x_FindRevMatchByAcc(TSeq_id_MatchList& id_list, 568 const string& acc, 569 const CTextseq_id* tid) const { 570 TVersion ver, *ver_ptr = 0; 571 if ( tid && tid->IsSetVersion() ) { 572 ver = tid->GetVersion(); 573 ver_ptr = &ver; 574 } 575 x_FindRevMatchByAcc(id_list, acc, ver_ptr); 576 } 577 void x_FindRevMatchByName(TSeq_id_MatchList& id_list, 578 const string& name, 579 const CTextseq_id* tid = 0) const; 580 581 CSeq_id::E_Choice m_Type; 582 TStringMap m_ByAcc; 583 TStringMap m_ByName; // Used for searching by string 584 TPackedMap m_PackedMap; 585 }; 586 587 588 //////////////////////////////////////////////////////////////////// 589 // Genbank, EMBL and DDBJ joint tree 590 591 592 class CSeq_id_GB_Tree : public CSeq_id_Textseq_Tree 593 { 594 public: 595 CSeq_id_GB_Tree(CSeq_id_Mapper* mapper); 596 protected: 597 virtual bool x_Check(const CSeq_id::E_Choice& type) const; 598 }; 599 600 601 //////////////////////////////////////////////////////////////////// 602 // Pir tree 603 604 605 class CSeq_id_Pir_Tree : public CSeq_id_Textseq_Tree 606 { 607 public: 608 CSeq_id_Pir_Tree(CSeq_id_Mapper* mapper); 609 }; 610 611 612 //////////////////////////////////////////////////////////////////// 613 // Swissprot 614 615 616 class CSeq_id_Swissprot_Tree : public CSeq_id_Textseq_Tree 617 { 618 public: 619 CSeq_id_Swissprot_Tree(CSeq_id_Mapper* mapper); 620 }; 621 622 623 //////////////////////////////////////////////////////////////////// 624 // Prf tree 625 626 627 class CSeq_id_Prf_Tree : public CSeq_id_Textseq_Tree 628 { 629 public: 630 CSeq_id_Prf_Tree(CSeq_id_Mapper* mapper); 631 }; 632 633 634 //////////////////////////////////////////////////////////////////// 635 // Tpg tree 636 637 638 class CSeq_id_Tpg_Tree : public CSeq_id_Textseq_Tree 639 { 640 public: 641 CSeq_id_Tpg_Tree(CSeq_id_Mapper* mapper); 642 }; 643 644 645 //////////////////////////////////////////////////////////////////// 646 // Tpe tree 647 648 649 class CSeq_id_Tpe_Tree : public CSeq_id_Textseq_Tree 650 { 651 public: 652 CSeq_id_Tpe_Tree(CSeq_id_Mapper* mapper); 653 }; 654 655 656 //////////////////////////////////////////////////////////////////// 657 // Tpd tree 658 659 660 class CSeq_id_Tpd_Tree : public CSeq_id_Textseq_Tree 661 { 662 public: 663 CSeq_id_Tpd_Tree(CSeq_id_Mapper* mapper); 664 }; 665 666 667 //////////////////////////////////////////////////////////////////// 668 // Gpipe tree 669 670 671 class CSeq_id_Gpipe_Tree : public CSeq_id_Textseq_Tree 672 { 673 public: 674 CSeq_id_Gpipe_Tree(CSeq_id_Mapper* mapper); 675 }; 676 677 678 //////////////////////////////////////////////////////////////////// 679 // Named-annot-track tree 680 681 682 class CSeq_id_Named_annot_track_Tree : public CSeq_id_Textseq_Tree 683 { 684 public: 685 CSeq_id_Named_annot_track_Tree(CSeq_id_Mapper* mapper); 686 }; 687 688 689 //////////////////////////////////////////////////////////////////// 690 // Other tree 691 692 693 class CSeq_id_Other_Tree : public CSeq_id_Textseq_Tree 694 { 695 public: 696 CSeq_id_Other_Tree(CSeq_id_Mapper* mapper); 697 }; 698 699 700 //////////////////////////////////////////////////////////////////// 701 // e_Local tree 702 703 704 class CSeq_id_Local_Info : public CSeq_id_Info { 705 public: 706 CSeq_id_Local_Info(const CObject_id& oid, CSeq_id_Mapper* mapper); 707 ~CSeq_id_Local_Info(void); 708 IsId() const709 bool IsId() const { 710 return m_IsId; 711 } HasMatchingId() const712 bool HasMatchingId() const { 713 return m_HasMatchingId; 714 } GetMatchingId() const715 CObject_id::TId GetMatchingId() const { 716 return m_MatchingId; 717 } 718 719 TVariant ParseCaseVariant(const string& str) const; 720 TVariant ParseCaseVariant(const CObject_id& oid) const; 721 722 virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const; 723 724 private: 725 bool m_IsId; 726 bool m_HasMatchingId; 727 CObject_id::TId m_MatchingId; 728 }; 729 730 731 class CSeq_id_Local_Tree : public CSeq_id_Which_Tree 732 { 733 public: 734 CSeq_id_Local_Tree(CSeq_id_Mapper* mapper); 735 ~CSeq_id_Local_Tree(void); 736 737 virtual bool Empty(void) const; 738 739 virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const; 740 virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id); 741 742 // Get the list of matching seq-id (int id = str id). 743 virtual bool HaveMatch(const CSeq_id_Handle& id) const; 744 virtual void FindMatch(const CSeq_id_Handle& id, 745 TSeq_id_MatchList& id_list) const; 746 virtual void FindMatchStr(const string& sid, 747 TSeq_id_MatchList& id_list) const; 748 749 virtual size_t Dump(CNcbiOstream& out, 750 CSeq_id::E_Choice type, 751 int details) const; 752 753 private: 754 virtual void x_Unindex(const CSeq_id_Info* info); 755 CSeq_id_Local_Info* x_FindInfo(const CObject_id& oid) const; 756 CSeq_id_Local_Info* x_FindStrInfo(const string& str) const; 757 CSeq_id_Local_Info* x_FindIdInfo(CObject_id::TId id) const; 758 759 typedef unordered_map<string, CSeq_id_Local_Info*, PHashNocase, PEqualNocase> TByStr; 760 typedef map<CObject_id::TId, CSeq_id_Local_Info*> TById; 761 762 TByStr m_ByStr; 763 TById m_ById; 764 }; 765 766 767 //////////////////////////////////////////////////////////////////// 768 // e_General tree 769 770 771 class CSeq_id_General_Id_Info : public CSeq_id_Info { 772 public: 773 typedef string TKey; 774 typedef PNocase PKeyLess; 775 776 CSeq_id_General_Id_Info(CSeq_id_Mapper* mapper, const TKey& key); 777 ~CSeq_id_General_Id_Info(void); 778 GetKey(void) const779 const TKey& GetKey(void) const { 780 return m_Key; 781 } GetDbtag(void) const782 const string& GetDbtag(void) const { 783 return m_Key; 784 } 785 void Restore(CDbtag& id, TPacked param, TVariant variant) const; 786 787 static TPacked Pack(const TKey& key, const CDbtag& id); 788 789 virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const; 790 791 private: 792 TKey m_Key; 793 }; 794 795 796 class CSeq_id_General_Str_Info : public CSeq_id_Info { 797 public: 798 struct TKey { 799 TPacked m_Key; 800 string m_Db; 801 string m_StrPrefix; 802 string m_StrSuffix; operator ==CSeq_id_General_Str_Info::TKey803 bool operator==(const TKey& b) const { 804 return m_Key == b.m_Key && 805 PEqualNocase()(m_StrSuffix, b.m_StrSuffix) && 806 PEqualNocase()(m_StrPrefix, b.m_StrPrefix) && 807 PEqualNocase()(m_Db, b.m_Db); 808 } operator !=CSeq_id_General_Str_Info::TKey809 bool operator!=(const TKey& b) const { 810 return !(*this == b); 811 } GetStrDigitsCSeq_id_General_Str_Info::TKey812 size_t GetStrDigits(void) const { 813 return m_Key & 0xff; 814 } 815 TVariant ParseCaseVariant(const CDbtag& dbtag) const; 816 }; 817 struct PKeyLess { operator ()CSeq_id_General_Str_Info::PKeyLess818 bool operator()(const TKey& a, const TKey& b) const { 819 if ( a.m_Key != b.m_Key ) { 820 return a.m_Key < b.m_Key; 821 } 822 int diff = NStr::CompareNocase(a.m_StrSuffix, b.m_StrSuffix); 823 if ( diff == 0 ) { 824 diff = NStr::CompareNocase(a.m_StrPrefix, b.m_StrPrefix); 825 if ( diff == 0 ) { 826 diff = NStr::CompareNocase(a.m_Db, b.m_Db); 827 } 828 } 829 return diff < 0; 830 } 831 }; 832 struct PHash { operator ()CSeq_id_General_Str_Info::PHash833 TPacked operator()(const TKey& a) const { 834 return a.m_Key; 835 } 836 }; 837 838 CSeq_id_General_Str_Info(CSeq_id_Mapper* mapper, const TKey& key); 839 ~CSeq_id_General_Str_Info(void); 840 GetKey(void) const841 const TKey& GetKey(void) const { 842 return m_Key; 843 } GetDbtag(void) const844 const string& GetDbtag(void) const { 845 return m_Key.m_Db; 846 } GetStrPrefix(void) const847 const string& GetStrPrefix(void) const { 848 return m_Key.m_StrPrefix; 849 } GetStrSuffix(void) const850 const string& GetStrSuffix(void) const { 851 return m_Key.m_StrSuffix; 852 } GetStrDigits(void) const853 size_t GetStrDigits(void) const { 854 return m_Key.GetStrDigits(); 855 } 856 void Restore(CDbtag& id, TPacked param, TVariant variant) const; 857 858 static TKey Parse(const CDbtag& id); 859 static TPacked Pack(const TKey& key, const CDbtag& id); 860 861 virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const; 862 863 private: 864 TKey m_Key; 865 }; 866 867 868 class CSeq_id_General_PlainInfo : public CSeq_id_Info { 869 public: 870 CSeq_id_General_PlainInfo(const CDbtag& dbid, CSeq_id_Mapper* mapper); 871 872 virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const; 873 874 TVariant ParseCaseVariant(const CDbtag& dbtag) const; 875 }; 876 877 878 class CSeq_id_General_Tree : public CSeq_id_Which_Tree 879 { 880 public: 881 CSeq_id_General_Tree(CSeq_id_Mapper* mapper); 882 ~CSeq_id_General_Tree(void); 883 884 virtual bool Empty(void) const; 885 886 virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const; 887 virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id); 888 889 // Get the list of matching seq-id (int id = str id). 890 virtual bool HaveMatch(const CSeq_id_Handle& id) const; 891 virtual void FindMatch(const CSeq_id_Handle& id, 892 TSeq_id_MatchList& id_list) const; 893 virtual void FindMatchStr(const string& sid, 894 TSeq_id_MatchList& id_list) const; 895 896 virtual size_t Dump(CNcbiOstream& out, 897 CSeq_id::E_Choice type, 898 int details) const; 899 900 private: 901 virtual void x_Unindex(const CSeq_id_Info* info); 902 CSeq_id_General_PlainInfo* x_FindInfo(const CDbtag& dbid) const; 903 904 struct STagMap { 905 public: 906 typedef unordered_map<string, CSeq_id_General_PlainInfo*, PHashNocase, PEqualNocase> TByStr; 907 typedef map<TPacked, CSeq_id_General_PlainInfo*> TById; 908 TByStr m_ByStr; 909 TById m_ById; 910 }; 911 typedef map<string, STagMap, PNocase> TDbMap; 912 typedef CSeq_id_General_Id_Info::TKey TPackedIdKey; 913 typedef map<TPackedIdKey, CConstRef<CSeq_id_General_Id_Info>, 914 CSeq_id_General_Id_Info::PKeyLess> TPackedIdMap; 915 typedef CSeq_id_General_Str_Info::TKey TPackedStrKey; 916 typedef unordered_map<TPackedStrKey, CConstRef<CSeq_id_General_Str_Info>, 917 CSeq_id_General_Str_Info::PHash> TPackedStrMap; 918 919 TDbMap m_DbMap; 920 TPackedIdMap m_PackedIdMap; 921 TPackedStrMap m_PackedStrMap; 922 }; 923 924 925 //////////////////////////////////////////////////////////////////// 926 // e_Giim tree 927 928 929 class CSeq_id_Giim_Tree : public CSeq_id_Which_Tree 930 { 931 public: 932 CSeq_id_Giim_Tree(CSeq_id_Mapper* mapper); 933 ~CSeq_id_Giim_Tree(void); 934 935 virtual bool Empty(void) const; 936 937 virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const; 938 virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id); 939 940 virtual void FindMatchStr(const string& sid, 941 TSeq_id_MatchList& id_list) const; 942 943 virtual size_t Dump(CNcbiOstream& out, 944 CSeq_id::E_Choice type, 945 int details) const; 946 947 private: 948 virtual void x_Unindex(const CSeq_id_Info* info); 949 CSeq_id_Info* x_FindInfo(const CGiimport_id& gid) const; 950 951 // 2-level indexing: first by Id, second by Db+Release 952 typedef vector<CSeq_id_Info*> TGiimList; 953 typedef map<TPacked, TGiimList> TIdMap; 954 955 TIdMap m_IdMap; 956 }; 957 958 959 //////////////////////////////////////////////////////////////////// 960 // e_Patent tree 961 962 963 class CSeq_id_Patent_Tree : public CSeq_id_Which_Tree 964 { 965 public: 966 CSeq_id_Patent_Tree(CSeq_id_Mapper* mapper); 967 ~CSeq_id_Patent_Tree(void); 968 969 virtual bool Empty(void) const; 970 971 virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const; 972 virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id); 973 974 virtual void FindMatchStr(const string& sid, 975 TSeq_id_MatchList& id_list) const; 976 977 virtual size_t Dump(CNcbiOstream& out, 978 CSeq_id::E_Choice type, 979 int details) const; 980 981 private: 982 virtual void x_Unindex(const CSeq_id_Info* info); 983 CSeq_id_Info* x_FindInfo(const CPatent_seq_id& pid) const; 984 985 // 3-level indexing: country, (number|app_number), seqid. 986 // Ignoring patent doc-type in indexing. 987 struct SPat_idMap { 988 typedef map<TPacked, CSeq_id_Info*> TBySeqid; 989 typedef map<string, TBySeqid, PNocase> TByNumber; // or by App_number 990 991 TByNumber m_ByNumber; 992 TByNumber m_ByApp_number; 993 }; 994 typedef map<string, SPat_idMap, PNocase> TByCountry; 995 996 TByCountry m_CountryMap; 997 }; 998 999 1000 //////////////////////////////////////////////////////////////////// 1001 // e_PDB tree 1002 1003 1004 class CSeq_id_PDB_Tree : public CSeq_id_Which_Tree 1005 { 1006 public: 1007 CSeq_id_PDB_Tree(CSeq_id_Mapper* mapper); 1008 ~CSeq_id_PDB_Tree(void); 1009 1010 virtual bool Empty(void) const; 1011 1012 virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const; 1013 virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id); 1014 1015 virtual bool HaveMatch(const CSeq_id_Handle& id) const; 1016 virtual void FindMatch(const CSeq_id_Handle& id, 1017 TSeq_id_MatchList& id_list) const; 1018 virtual void FindMatchStr(const string& sid, 1019 TSeq_id_MatchList& id_list) const; 1020 virtual bool HaveReverseMatch(const CSeq_id_Handle& id) const; 1021 virtual void FindReverseMatch(const CSeq_id_Handle& id, 1022 TSeq_id_MatchList& id_list); 1023 1024 virtual size_t Dump(CNcbiOstream& out, 1025 CSeq_id::E_Choice type, 1026 int details) const; 1027 1028 private: 1029 virtual void x_Unindex(const CSeq_id_Info* info); 1030 1031 string x_IdToStrKey(const CPDB_seq_id& id) const; 1032 1033 // Index by mol+chain, no date - too complicated 1034 typedef vector<CSeq_id_Info*> TSubMolList; 1035 typedef map<string, TSubMolList, PCase> TMolMap; 1036 1037 TMolMap m_MolMap; 1038 }; 1039 1040 1041 // Seq-id mapper exception 1042 class NCBI_SEQ_EXPORT CSeq_id_MapperException : public CException 1043 { 1044 public: 1045 enum EErrCode { 1046 eTypeError, 1047 eSymbolError, 1048 eEmptyError, 1049 eOtherError 1050 }; 1051 const char* GetErrCodeString(void) const override; 1052 NCBI_EXCEPTION_DEFAULT(CSeq_id_MapperException,CException); 1053 }; 1054 1055 1056 ///////////////////////////////////////////////////////////////////////////// 1057 // 1058 // Inline methods 1059 // 1060 ///////////////////////////////////////////////////////////////////////////// 1061 1062 END_SCOPE(objects) 1063 END_NCBI_SCOPE 1064 1065 #endif /* OBJECTS_OBJMGR___SEQ_ID_TREE__HPP */ 1066