1 #ifndef OBJECTS_OBJMGR___SEQ_ID_TREE__HPP
2 #define OBJECTS_OBJMGR___SEQ_ID_TREE__HPP
3 
4 /*  $Id: seq_id_tree.hpp 629051 2021-04-09 11:36:07Z ivanov $
5 * ===========================================================================
6 *
7 *                            PUBLIC DOMAIN NOTICE
8 *               National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's official duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Aleksey Grichenko, Eugene Vasilchenko
30 *
31 * File Description:
32 *   Seq-id mapper for Object Manager
33 *
34 */
35 
36 #include <corelib/ncbiobj.hpp>
37 #include <corelib/ncbimtx.hpp>
38 #include <corelib/ncbistr.hpp>
39 #include <corelib/ncbi_limits.hpp>
40 
41 #include <objects/general/Date.hpp>
42 #include <objects/general/Dbtag.hpp>
43 #include <objects/general/Object_id.hpp>
44 
45 #include <objects/biblio/Id_pat.hpp>
46 
47 #include <objects/seqloc/Seq_id.hpp>
48 #include <objects/seqloc/PDB_mol_id.hpp>
49 #include <objects/seqloc/PDB_seq_id.hpp>
50 #include <objects/seqloc/Patent_seq_id.hpp>
51 #include <objects/seqloc/Giimport_id.hpp>
52 #include <objects/seqloc/Textseq_id.hpp>
53 
54 #include <objects/seq/seq_id_handle.hpp>
55 
56 #include <vector>
57 #include <set>
58 #include <map>
59 #include <unordered_map>
60 
61 BEGIN_NCBI_SCOPE
62 BEGIN_SCOPE(objects)
63 
64 
65 class CSeq_id;
66 class CSeq_id_Handle;
67 class CSeq_id_Info;
68 class CSeq_id_Mapper;
69 class CSeq_id_Which_Tree;
70 
// Case-insensitive hash functor for string keys.
struct PHashNocase {
    // Fold a single character for hashing by clearing bit 5 (value 32).
    // Ids are expected to contain only ASCII characters, where the upper
    // and lower case forms of a letter differ in exactly that one bit,
    // so masking it off is a cheaper substitute for tolower().
    static char get_hash(char c)
        {
            const int kCaseBit = 32;
            return c & ~kCaseBit;
        }
    // Hash value for a whole string: seeded with the string length, then
    // every case-folded character is mixed in with a multiplier of 17.
    size_t operator()(const string& s) const
        {
            const size_t length = s.size();
            size_t hash = length;
            for ( size_t pos = 0; pos < length; ++pos ) {
                hash = hash*17 + get_hash(s[pos]);
            }
            return hash;
        }
};
// Case-insensitive equality predicate for string keys.
struct PEqualNocase {
    bool operator()(const string& s1, const string& s2) const
        {
            // Fast path: identical case is the common situation, and the
            // plain string comparison is the cheapest way to detect it.
            if ( s1 == s2 ) {
                return true;
            }
            // Strings of different length can never match.
            if ( s1.size() != s2.size() ) {
                return false;
            }
            // Same length, different bytes: compare character by character
            // ignoring case (this replaced the slower NStr::EqualNocase()).
            string::const_iterator it2 = s2.begin();
            for ( string::const_iterator it1 = s1.begin();
                  it1 != s1.end(); ++it1, ++it2 ) {
                const int lower1 = tolower(static_cast<unsigned char>(*it1));
                const int lower2 = tolower(static_cast<unsigned char>(*it2));
                if ( lower1 != lower2 ) {
                    return false;
                }
            }
            return true;
        }
};
115 
116 ////////////////////////////////////////////////////////////////////
117 //
118 //  CSeq_id_***_Tree::
119 //
120 //    Seq-id sub-type specific trees
121 //
122 
123 
124 // Base class for seq-id type-specific trees
125 class CSeq_id_Which_Tree : public CObject
126 {
127 public:
128     // 'ctors
129     CSeq_id_Which_Tree(CSeq_id_Mapper* mapper);
130     virtual ~CSeq_id_Which_Tree(void);
131 
132     static void Initialize(CSeq_id_Mapper* mapper,
133                            vector<CRef<CSeq_id_Which_Tree> >& v);
134 
135     virtual bool Empty(void) const = 0;
136 
137     // Find exaclty the same seq-id
138     virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const = 0;
139     virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id) = 0;
140     virtual CSeq_id_Handle GetGiHandle(TGi gi);
141 
142     virtual void DropInfo(const CSeq_id_Info* info);
143 
144     typedef set<CSeq_id_Handle> TSeq_id_MatchList;
145 
146     // Get the list of matching seq-id.
147     virtual bool HaveMatch(const CSeq_id_Handle& id) const;
148     virtual void FindMatch(const CSeq_id_Handle& id,
149                            TSeq_id_MatchList& id_list) const;
150     virtual void FindMatchStr(const string& sid,
151                               TSeq_id_MatchList& id_list) const = 0;
152 
153     // returns true if FindMatch(h1, id_list) will put h2 in id_list.
154     virtual bool Match(const CSeq_id_Handle& h1,
155                        const CSeq_id_Handle& h2) const;
156 
157     virtual bool IsBetterVersion(const CSeq_id_Handle& h1,
158                                  const CSeq_id_Handle& h2) const;
159 
160     // Reverse matching
161     virtual bool HaveReverseMatch(const CSeq_id_Handle& id) const;
162     virtual void FindReverseMatch(const CSeq_id_Handle& id,
163                                   TSeq_id_MatchList& id_list);
164 
165     virtual size_t Dump(CNcbiOstream& out,
166                         CSeq_id::E_Choice type,
167                         int details) const = 0;
168 
169 protected:
170     friend class CSeq_id_Mapper;
171 
172     typedef CSeq_id_Info::TPacked TPacked;
173 
174     CSeq_id_Info* CreateInfo(CSeq_id::E_Choice type);
175     CSeq_id_Info* CreateInfo(const CSeq_id& id);
176 
GetInfo(const CSeq_id_Handle & id)177     static const CSeq_id_Info* GetInfo(const CSeq_id_Handle& id)
178         {
179             return id.m_Info;
180         }
GetSeqId(const CSeq_id_Info * info)181     static const CSeq_id* GetSeqId(const CSeq_id_Info* info)
182         {
183             return info->m_Seq_id.GetPointerOrNull();
184         }
185     virtual void x_Unindex(const CSeq_id_Info* info) = 0;
186 
187     typedef CFastMutex TTreeLock;
188     typedef TTreeLock::TReadLockGuard TReadLockGuard;
189     typedef TTreeLock::TWriteLockGuard TWriteLockGuard;
190 
191     mutable TTreeLock m_TreeLock;
192     CSeq_id_Mapper* m_Mapper;
193 
194 private:
195     CSeq_id_Which_Tree(const CSeq_id_Which_Tree& tree);
196     const CSeq_id_Which_Tree& operator=(const CSeq_id_Which_Tree& tree);
197 };
198 
199 
200 
201 ////////////////////////////////////////////////////////////////////
202 // not-set tree (maximum 1 entry allowed)
203 
204 
205 class CSeq_id_not_set_Tree : public CSeq_id_Which_Tree
206 {
207 public:
208     CSeq_id_not_set_Tree(CSeq_id_Mapper* mapper);
209     ~CSeq_id_not_set_Tree(void);
210 
211     virtual bool Empty(void) const;
212 
213     virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const;
214     virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id);
215 
216     virtual void DropInfo(const CSeq_id_Info* info);
217 
218     virtual void FindMatch(const CSeq_id_Handle& id,
219                            TSeq_id_MatchList& id_list) const;
220     virtual void FindMatchStr(const string& sid,
221                               TSeq_id_MatchList& id_list) const;
222     virtual void FindReverseMatch(const CSeq_id_Handle& id,
223                                   TSeq_id_MatchList& id_list);
224 
225     virtual size_t Dump(CNcbiOstream& out,
226                         CSeq_id::E_Choice type,
227                         int details) const;
228 
229 protected:
230     virtual void x_Unindex(const CSeq_id_Info* info);
231     bool x_Check(const CSeq_id& id) const;
232 };
233 
234 
235 ////////////////////////////////////////////////////////////////////
236 // Base class for Gi, Gibbsq & Gibbmt trees
237 
238 
239 class CSeq_id_int_Tree : public CSeq_id_Which_Tree
240 {
241 public:
242     CSeq_id_int_Tree(CSeq_id_Mapper* mapper);
243     ~CSeq_id_int_Tree(void);
244 
245     virtual bool Empty(void) const;
246 
247     virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const;
248     virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id);
249 
250     virtual void FindMatchStr(const string& sid,
251                               TSeq_id_MatchList& id_list) const;
252 
253     virtual size_t Dump(CNcbiOstream& out,
254                         CSeq_id::E_Choice type,
255                         int details) const;
256 
257 protected:
258     virtual void x_Unindex(const CSeq_id_Info* info);
259     virtual bool x_Check(const CSeq_id& id) const = 0;
260     virtual TPacked x_Get(const CSeq_id& id) const = 0;
261 
262 private:
263     typedef map<TPacked, CSeq_id_Info*> TIntMap;
264     TIntMap m_IntMap;
265 };
266 
267 
268 ////////////////////////////////////////////////////////////////////
269 // Gibbsq tree
270 
271 
272 class CSeq_id_Gibbsq_Tree : public CSeq_id_int_Tree
273 {
274 public:
275     CSeq_id_Gibbsq_Tree(CSeq_id_Mapper* mapper);
276 protected:
277     virtual bool x_Check(const CSeq_id& id) const;
278     virtual TPacked x_Get(const CSeq_id& id) const;
279 };
280 
281 
282 ////////////////////////////////////////////////////////////////////
283 // Gibbmt tree
284 
285 
286 class CSeq_id_Gibbmt_Tree : public CSeq_id_int_Tree
287 {
288 public:
289     CSeq_id_Gibbmt_Tree(CSeq_id_Mapper* mapper);
290 protected:
291     virtual bool x_Check(const CSeq_id& id) const;
292     virtual TPacked x_Get(const CSeq_id& id) const;
293 };
294 
295 
296 ////////////////////////////////////////////////////////////////////
297 // Gi tree
298 
299 
300 class CSeq_id_Gi_Info : public CSeq_id_Info
301 {
302 public:
303     CSeq_id_Gi_Info(CSeq_id_Mapper* mapper);
304 
305     virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant /*variant*/) const;
306 };
307 
308 
309 class CSeq_id_Gi_Tree : public CSeq_id_Which_Tree
310 {
311 public:
312     CSeq_id_Gi_Tree(CSeq_id_Mapper* mapper);
313     ~CSeq_id_Gi_Tree(void);
314 
315     virtual bool Empty(void) const;
316 
317     virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const;
318     virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id);
319     virtual CSeq_id_Handle GetGiHandle(TGi gi);
320 
321     virtual void FindMatchStr(const string& sid,
322                               TSeq_id_MatchList& id_list) const;
323 
324     virtual size_t Dump(CNcbiOstream& out,
325                         CSeq_id::E_Choice type,
326                         int details) const;
327 
328 protected:
329     virtual void x_Unindex(const CSeq_id_Info* info);
330     bool x_Check(const CSeq_id& id) const;
331     TGi x_Get(const CSeq_id& id) const;
332 
333     CSeq_id_Info* m_ZeroInfo;
334     CSeq_id_Info* m_SharedInfo;
335 };
336 
337 
338 ////////////////////////////////////////////////////////////////////
339 // Base class for e_Genbank, e_Embl, e_Pir, e_Swissprot, e_Other,
340 // e_Ddbj, e_Prf, e_Tpg, e_Tpe, e_Tpd trees
341 
342 
343 class CSeq_id_Textseq_PlainInfo : public CSeq_id_Info
344 {
345 public:
346     CSeq_id_Textseq_PlainInfo(const CConstRef<CSeq_id>& seq_id, CSeq_id_Mapper* mapper);
347 
348     virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const;
349 
350     TVariant ParseCaseVariant(const string& acc) const;
351     TVariant ParseCaseVariant(const CTextseq_id& id) const;
352 };
353 
354 
355 class CSeq_id_Textseq_Info : public CSeq_id_Info {
356 public:
357     typedef CTextseq_id::TVersion TVersion;
358 
359     struct TKey {
TKeyCSeq_id_Textseq_Info::TKey360         TKey(void)
361             : m_Hash(0), m_Version(0)
362             {
363             }
364 
365         unsigned m_Hash;
366         TVersion m_Version;
367         Uint1 m_PrefixLen;
368         enum {
369             kMaxPrefixLen = 7
370         };
371         char m_PrefixBuf[kMaxPrefixLen];
372 
373         DECLARE_OPERATOR_BOOL(m_Hash != 0);
374 
operator ==CSeq_id_Textseq_Info::TKey375         bool operator==(const TKey& b) const {
376             return m_Hash == b.m_Hash && m_Version == b.m_Version &&
377                 NStr::EqualNocase(GetAccPrefix(), b.GetAccPrefix());
378         }
operator !=CSeq_id_Textseq_Info::TKey379         bool operator!=(const TKey& b) const {
380             return !(*this == b);
381         }
operator <CSeq_id_Textseq_Info::TKey382         bool operator<(const TKey& b) const {
383             return m_Hash < b.m_Hash ||
384                 (m_Hash == b.m_Hash &&
385                  (m_Version < b.m_Version ||
386                   (m_Version == b.m_Version &&
387                    NStr::CompareNocase(GetAccPrefix(), b.GetAccPrefix()) < 0)));
388         }
389 
SameHashCSeq_id_Textseq_Info::TKey390         bool SameHash(const TKey& b) const {
391             return m_Hash == b.m_Hash;
392         }
SameHashNoVerCSeq_id_Textseq_Info::TKey393         bool SameHashNoVer(const TKey& b) const {
394             return ((m_Hash ^ b.m_Hash) & ~1) == 0;
395         }
EqualAccCSeq_id_Textseq_Info::TKey396         bool EqualAcc(const TKey& b) const {
397             return SameHashNoVer(b) &&
398                 NStr::EqualNocase(GetAccPrefix(), b.GetAccPrefix());
399         }
400 
IsSetVersionCSeq_id_Textseq_Info::TKey401         bool IsSetVersion(void) const {
402             return (m_Hash & 1) != 0;
403         }
GetVersionCSeq_id_Textseq_Info::TKey404         const TVersion& GetVersion(void) const {
405             _ASSERT(IsSetVersion());
406             return m_Version;
407         }
ResetVersionCSeq_id_Textseq_Info::TKey408         void ResetVersion(void) {
409             m_Hash &= ~1;
410             m_Version = 0;
411         }
SetVersionCSeq_id_Textseq_Info::TKey412         void SetVersion(TVersion version) {
413             m_Hash |= 1;
414             m_Version = version;
415         }
GetAccDigitsCSeq_id_Textseq_Info::TKey416         int GetAccDigits(void) const {
417             return (m_Hash & 0xff) >> 1;
418         }
419         TVariant ParseCaseVariant(const string& acc) const;
420 
GetPrefixLenCSeq_id_Textseq_Info::TKey421         size_t GetPrefixLen() const {
422             return m_PrefixLen;
423         }
GetAccPrefixCSeq_id_Textseq_Info::TKey424         CTempString GetAccPrefix(void) const {
425             return CTempString(m_PrefixBuf, m_PrefixLen);
426         }
427     };
428     CSeq_id_Textseq_Info(CSeq_id::E_Choice type,
429                          CSeq_id_Mapper* mapper,
430                          const TKey& key);
431     ~CSeq_id_Textseq_Info(void);
432 
GetKey(void) const433     const TKey& GetKey(void) const {
434         return m_Key;
435     }
GetAccPrefix(void) const436     CTempString GetAccPrefix(void) const {
437         return m_Key.GetAccPrefix();
438     }
GoodPrefix(const CTempString & acc) const439     bool GoodPrefix(const CTempString& acc) const {
440         return NStr::StartsWith(acc, GetAccPrefix(), NStr::eNocase);
441     }
GetAccDigits(void) const442     int GetAccDigits(void) const {
443         return m_Key.GetAccDigits();
444     }
IsSetVersion(void) const445     bool IsSetVersion(void) const {
446         return m_Key.IsSetVersion();
447     }
GetVersion(void) const448     const TVersion& GetVersion(void) const {
449         return m_Key.GetVersion();
450     }
451     void RestoreAccession(string& acc, TPacked param, TVariant variant) const;
452     void Restore(CTextseq_id& id, TPacked param, TVariant variant) const;
453 
454     static TKey ParseAcc(const string& acc, const TVersion* ver);
ParseAcc(const string & acc,const CTextseq_id & tid)455     static TKey ParseAcc(const string& acc, const CTextseq_id& tid) {
456         TVersion ver;
457         const TVersion *ver_ptr = 0;
458         if ( tid.IsSetVersion() ) {
459             ver = tid.GetVersion();
460             ver_ptr = &ver;
461         }
462         return ParseAcc(acc, ver_ptr);
463     }
464     static TPacked Pack(const TKey& key, const string& acc);
465     static TPacked Pack(const TKey& key, const CTextseq_id& id);
466     static TVariant ParseCaseVariant(const CSeq_id_Info* info, const string& acc);
467 
468     virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const;
469 
470 private:
471     TKey m_Key;
472 };
473 
474 
475 class CSeq_id_Textseq_Tree : public CSeq_id_Which_Tree
476 {
477 public:
478     typedef CTextseq_id::TVersion TVersion;
479 
480     CSeq_id_Textseq_Tree(CSeq_id_Mapper* mapper, CSeq_id::E_Choice type);
481     ~CSeq_id_Textseq_Tree(void);
482 
483     virtual bool Empty(void) const;
484 
485     virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const;
486     virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id);
487 
488     virtual bool HaveMatch(const CSeq_id_Handle& id) const;
489     virtual void FindMatch(const CSeq_id_Handle& id,
490                            TSeq_id_MatchList& id_list) const;
491     virtual void FindMatchStr(const string& sid,
492                               TSeq_id_MatchList& id_list) const;
493 
494     virtual bool Match(const CSeq_id_Handle& h1,
495                        const CSeq_id_Handle& h2) const;
496     virtual bool IsBetterVersion(const CSeq_id_Handle& h1,
497                                  const CSeq_id_Handle& h2) const;
498 
499     virtual bool HaveReverseMatch(const CSeq_id_Handle& id) const;
500     virtual void FindReverseMatch(const CSeq_id_Handle& id,
501                                   TSeq_id_MatchList& id_list);
502 
503     virtual size_t Dump(CNcbiOstream& out,
504                         CSeq_id::E_Choice type,
505                         int details) const;
506 
507 protected:
508     virtual void x_Unindex(const CSeq_id_Info* info);
509     virtual bool x_Check(const CSeq_id::E_Choice& type) const;
510     virtual bool x_Check(const CSeq_id& id) const;
x_Get(const CSeq_id & id) const511     const CTextseq_id& x_Get(const CSeq_id& id) const {
512         const CTextseq_id* text_id = id.GetTextseq_Id();
513         _ASSERT(text_id);
514         return *text_id;
515     }
516     CSeq_id_Textseq_PlainInfo* x_FindStrInfo(CSeq_id::E_Choice type,
517                                              const CTextseq_id& tid) const;
518     bool x_GetVersion(TVersion& version, const CSeq_id_Handle& id) const;
519 
520 private:
521     typedef multimap<string, CSeq_id_Textseq_PlainInfo*, PNocase> TStringMap;
522     typedef TStringMap::value_type TStringMapValue;
523     typedef TStringMap::const_iterator TStringMapCI;
524     typedef pair<TStringMapCI, TStringMapCI> TVersions;
525     typedef CSeq_id_Textseq_Info::TKey TPackedKey;
526     typedef map<TPackedKey, CConstRef<CSeq_id_Textseq_Info> > TPackedMap;
527     typedef TPackedMap::value_type TPackedMapValue;
528     typedef TPackedMap::iterator TPackedMap_I;
529     typedef TPackedMap::const_iterator TPackedMap_CI;
530 
531     static bool x_Equals(const CTextseq_id& id1, const CTextseq_id& id2);
532     static void x_Erase(TStringMap& str_map,
533                         const string& key,
534                         const CSeq_id_Info* info);
535 
536     CSeq_id_Textseq_PlainInfo* x_FindStrInfo(const TStringMap& str_map,
537                                              const string& str,
538                                              CSeq_id::E_Choice type,
539                                              const CTextseq_id& tid) const;
540 
541     void x_FindMatchByAcc(TSeq_id_MatchList& id_list,
542                           const string& acc,
543                           const TVersion* ver = 0) const;
x_FindMatchByAcc(TSeq_id_MatchList & id_list,const string & acc,const CTextseq_id * tid) const544     void x_FindMatchByAcc(TSeq_id_MatchList& id_list,
545                           const string& acc,
546                           const CTextseq_id* tid) const {
547         TVersion ver, *ver_ptr = 0;
548         if ( tid && tid->IsSetVersion() ) {
549             ver = tid->GetVersion();
550             ver_ptr = &ver;
551         }
552         x_FindMatchByAcc(id_list, acc, ver_ptr);
553     }
554     void x_FindMatchByName(TSeq_id_MatchList& id_list,
555                            const string& name,
556                            const CTextseq_id* tid = 0) const;
557 
558     void x_FindRevMatchByAccPacked(TSeq_id_MatchList& id_list,
559                                    const string& acc,
560                                    const TVersion* ver = 0) const;
561     void x_FindRevMatchByAccNonPacked(TSeq_id_MatchList& id_list,
562                                       const string& acc,
563                                       const TVersion* ver = 0) const;
564     void x_FindRevMatchByAcc(TSeq_id_MatchList& id_list,
565                              const string& acc,
566                              const TVersion* ver = 0) const;
x_FindRevMatchByAcc(TSeq_id_MatchList & id_list,const string & acc,const CTextseq_id * tid) const567     void x_FindRevMatchByAcc(TSeq_id_MatchList& id_list,
568                              const string& acc,
569                              const CTextseq_id* tid) const {
570         TVersion ver, *ver_ptr = 0;
571         if ( tid && tid->IsSetVersion() ) {
572             ver = tid->GetVersion();
573             ver_ptr = &ver;
574         }
575         x_FindRevMatchByAcc(id_list, acc, ver_ptr);
576     }
577     void x_FindRevMatchByName(TSeq_id_MatchList& id_list,
578                               const string& name,
579                               const CTextseq_id* tid = 0) const;
580 
581     CSeq_id::E_Choice m_Type;
582     TStringMap m_ByAcc;
583     TStringMap m_ByName; // Used for searching by string
584     TPackedMap m_PackedMap;
585 };
586 
587 
588 ////////////////////////////////////////////////////////////////////
589 // Genbank, EMBL and DDBJ joint tree
590 
591 
592 class CSeq_id_GB_Tree : public CSeq_id_Textseq_Tree
593 {
594 public:
595     CSeq_id_GB_Tree(CSeq_id_Mapper* mapper);
596 protected:
597     virtual bool x_Check(const CSeq_id::E_Choice& type) const;
598 };
599 
600 
601 ////////////////////////////////////////////////////////////////////
602 // Pir tree
603 
604 
605 class CSeq_id_Pir_Tree : public CSeq_id_Textseq_Tree
606 {
607 public:
608     CSeq_id_Pir_Tree(CSeq_id_Mapper* mapper);
609 };
610 
611 
612 ////////////////////////////////////////////////////////////////////
613 // Swissprot
614 
615 
616 class CSeq_id_Swissprot_Tree : public CSeq_id_Textseq_Tree
617 {
618 public:
619     CSeq_id_Swissprot_Tree(CSeq_id_Mapper* mapper);
620 };
621 
622 
623 ////////////////////////////////////////////////////////////////////
624 // Prf tree
625 
626 
627 class CSeq_id_Prf_Tree : public CSeq_id_Textseq_Tree
628 {
629 public:
630     CSeq_id_Prf_Tree(CSeq_id_Mapper* mapper);
631 };
632 
633 
634 ////////////////////////////////////////////////////////////////////
635 // Tpg tree
636 
637 
638 class CSeq_id_Tpg_Tree : public CSeq_id_Textseq_Tree
639 {
640 public:
641     CSeq_id_Tpg_Tree(CSeq_id_Mapper* mapper);
642 };
643 
644 
645 ////////////////////////////////////////////////////////////////////
646 // Tpe tree
647 
648 
649 class CSeq_id_Tpe_Tree : public CSeq_id_Textseq_Tree
650 {
651 public:
652     CSeq_id_Tpe_Tree(CSeq_id_Mapper* mapper);
653 };
654 
655 
656 ////////////////////////////////////////////////////////////////////
657 // Tpd tree
658 
659 
660 class CSeq_id_Tpd_Tree : public CSeq_id_Textseq_Tree
661 {
662 public:
663     CSeq_id_Tpd_Tree(CSeq_id_Mapper* mapper);
664 };
665 
666 
667 ////////////////////////////////////////////////////////////////////
668 // Gpipe tree
669 
670 
671 class CSeq_id_Gpipe_Tree : public CSeq_id_Textseq_Tree
672 {
673 public:
674     CSeq_id_Gpipe_Tree(CSeq_id_Mapper* mapper);
675 };
676 
677 
678 ////////////////////////////////////////////////////////////////////
679 // Named-annot-track tree
680 
681 
682 class CSeq_id_Named_annot_track_Tree : public CSeq_id_Textseq_Tree
683 {
684 public:
685     CSeq_id_Named_annot_track_Tree(CSeq_id_Mapper* mapper);
686 };
687 
688 
689 ////////////////////////////////////////////////////////////////////
690 // Other tree
691 
692 
693 class CSeq_id_Other_Tree : public CSeq_id_Textseq_Tree
694 {
695 public:
696     CSeq_id_Other_Tree(CSeq_id_Mapper* mapper);
697 };
698 
699 
700 ////////////////////////////////////////////////////////////////////
701 // e_Local tree
702 
703 
704 class CSeq_id_Local_Info : public CSeq_id_Info {
705 public:
706     CSeq_id_Local_Info(const CObject_id& oid, CSeq_id_Mapper* mapper);
707     ~CSeq_id_Local_Info(void);
708 
IsId() const709     bool IsId() const {
710         return m_IsId;
711     }
HasMatchingId() const712     bool HasMatchingId() const {
713         return m_HasMatchingId;
714     }
GetMatchingId() const715     CObject_id::TId GetMatchingId() const {
716         return m_MatchingId;
717     }
718 
719     TVariant ParseCaseVariant(const string& str) const;
720     TVariant ParseCaseVariant(const CObject_id& oid) const;
721 
722     virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const;
723 
724 private:
725     bool m_IsId;
726     bool m_HasMatchingId;
727     CObject_id::TId m_MatchingId;
728 };
729 
730 
731 class CSeq_id_Local_Tree : public CSeq_id_Which_Tree
732 {
733 public:
734     CSeq_id_Local_Tree(CSeq_id_Mapper* mapper);
735     ~CSeq_id_Local_Tree(void);
736 
737     virtual bool Empty(void) const;
738 
739     virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const;
740     virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id);
741 
742     // Get the list of matching seq-id (int id = str id).
743     virtual bool HaveMatch(const CSeq_id_Handle& id) const;
744     virtual void FindMatch(const CSeq_id_Handle& id,
745                            TSeq_id_MatchList& id_list) const;
746     virtual void FindMatchStr(const string& sid,
747                               TSeq_id_MatchList& id_list) const;
748 
749     virtual size_t Dump(CNcbiOstream& out,
750                         CSeq_id::E_Choice type,
751                         int details) const;
752 
753 private:
754     virtual void x_Unindex(const CSeq_id_Info* info);
755     CSeq_id_Local_Info* x_FindInfo(const CObject_id& oid) const;
756     CSeq_id_Local_Info* x_FindStrInfo(const string& str) const;
757     CSeq_id_Local_Info* x_FindIdInfo(CObject_id::TId id) const;
758 
759     typedef unordered_map<string, CSeq_id_Local_Info*, PHashNocase, PEqualNocase> TByStr;
760     typedef map<CObject_id::TId, CSeq_id_Local_Info*>         TById;
761 
762     TByStr m_ByStr;
763     TById  m_ById;
764 };
765 
766 
767 ////////////////////////////////////////////////////////////////////
768 // e_General tree
769 
770 
771 class CSeq_id_General_Id_Info : public CSeq_id_Info {
772 public:
773     typedef string TKey;
774     typedef PNocase PKeyLess;
775 
776     CSeq_id_General_Id_Info(CSeq_id_Mapper* mapper, const TKey& key);
777     ~CSeq_id_General_Id_Info(void);
778 
GetKey(void) const779     const TKey& GetKey(void) const {
780         return m_Key;
781     }
GetDbtag(void) const782     const string& GetDbtag(void) const {
783         return m_Key;
784     }
785     void Restore(CDbtag& id, TPacked param, TVariant variant) const;
786 
787     static TPacked Pack(const TKey& key, const CDbtag& id);
788 
789     virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const;
790 
791 private:
792     TKey m_Key;
793 };
794 
795 
796 class CSeq_id_General_Str_Info : public CSeq_id_Info {
797 public:
798     struct TKey {
799         TPacked m_Key;
800         string m_Db;
801         string m_StrPrefix;
802         string m_StrSuffix;
operator ==CSeq_id_General_Str_Info::TKey803         bool operator==(const TKey& b) const {
804             return m_Key == b.m_Key &&
805                 PEqualNocase()(m_StrSuffix, b.m_StrSuffix) &&
806                 PEqualNocase()(m_StrPrefix, b.m_StrPrefix) &&
807                 PEqualNocase()(m_Db, b.m_Db);
808         }
operator !=CSeq_id_General_Str_Info::TKey809         bool operator!=(const TKey& b) const {
810             return !(*this == b);
811         }
GetStrDigitsCSeq_id_General_Str_Info::TKey812         size_t GetStrDigits(void) const {
813             return m_Key & 0xff;
814         }
815         TVariant ParseCaseVariant(const CDbtag& dbtag) const;
816     };
817     struct PKeyLess {
operator ()CSeq_id_General_Str_Info::PKeyLess818         bool operator()(const TKey& a, const TKey& b) const {
819             if ( a.m_Key != b.m_Key ) {
820                 return a.m_Key < b.m_Key;
821             }
822             int diff = NStr::CompareNocase(a.m_StrSuffix, b.m_StrSuffix);
823             if ( diff == 0 ) {
824                 diff = NStr::CompareNocase(a.m_StrPrefix, b.m_StrPrefix);
825                 if ( diff == 0 ) {
826                     diff = NStr::CompareNocase(a.m_Db, b.m_Db);
827                 }
828             }
829             return diff < 0;
830         }
831     };
832     struct PHash {
operator ()CSeq_id_General_Str_Info::PHash833         TPacked operator()(const TKey& a) const {
834             return a.m_Key;
835         }
836     };
837 
838     CSeq_id_General_Str_Info(CSeq_id_Mapper* mapper, const TKey& key);
839     ~CSeq_id_General_Str_Info(void);
840 
GetKey(void) const841     const TKey& GetKey(void) const {
842         return m_Key;
843     }
GetDbtag(void) const844     const string& GetDbtag(void) const {
845         return m_Key.m_Db;
846     }
GetStrPrefix(void) const847     const string& GetStrPrefix(void) const {
848         return m_Key.m_StrPrefix;
849     }
GetStrSuffix(void) const850     const string& GetStrSuffix(void) const {
851         return m_Key.m_StrSuffix;
852     }
GetStrDigits(void) const853     size_t GetStrDigits(void) const {
854         return m_Key.GetStrDigits();
855     }
856     void Restore(CDbtag& id, TPacked param, TVariant variant) const;
857 
858     static TKey Parse(const CDbtag& id);
859     static TPacked Pack(const TKey& key, const CDbtag& id);
860 
861     virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const;
862 
863 private:
864     TKey m_Key;
865 };
866 
867 
868 class CSeq_id_General_PlainInfo : public CSeq_id_Info {
869 public:
870     CSeq_id_General_PlainInfo(const CDbtag& dbid, CSeq_id_Mapper* mapper);
871 
872     virtual CConstRef<CSeq_id> GetPackedSeqId(TPacked packed, TVariant variant) const;
873 
874     TVariant ParseCaseVariant(const CDbtag& dbtag) const;
875 };
876 
877 
878 class CSeq_id_General_Tree : public CSeq_id_Which_Tree
879 {
880 public:
881     CSeq_id_General_Tree(CSeq_id_Mapper* mapper);
882     ~CSeq_id_General_Tree(void);
883 
884     virtual bool Empty(void) const;
885 
886     virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const;
887     virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id);
888 
889     // Get the list of matching seq-id (int id = str id).
890     virtual bool HaveMatch(const CSeq_id_Handle& id) const;
891     virtual void FindMatch(const CSeq_id_Handle& id,
892                            TSeq_id_MatchList& id_list) const;
893     virtual void FindMatchStr(const string& sid,
894                               TSeq_id_MatchList& id_list) const;
895 
896     virtual size_t Dump(CNcbiOstream& out,
897                         CSeq_id::E_Choice type,
898                         int details) const;
899 
900 private:
901     virtual void x_Unindex(const CSeq_id_Info* info);
902     CSeq_id_General_PlainInfo* x_FindInfo(const CDbtag& dbid) const;
903 
904     struct STagMap {
905     public:
906         typedef unordered_map<string, CSeq_id_General_PlainInfo*, PHashNocase, PEqualNocase> TByStr;
907         typedef map<TPacked, CSeq_id_General_PlainInfo*>         TById;
908         TByStr m_ByStr;
909         TById  m_ById;
910     };
911     typedef map<string, STagMap, PNocase> TDbMap;
912     typedef CSeq_id_General_Id_Info::TKey TPackedIdKey;
913     typedef map<TPackedIdKey, CConstRef<CSeq_id_General_Id_Info>,
914                 CSeq_id_General_Id_Info::PKeyLess> TPackedIdMap;
915     typedef CSeq_id_General_Str_Info::TKey TPackedStrKey;
916     typedef unordered_map<TPackedStrKey, CConstRef<CSeq_id_General_Str_Info>,
917                           CSeq_id_General_Str_Info::PHash> TPackedStrMap;
918 
919     TDbMap m_DbMap;
920     TPackedIdMap m_PackedIdMap;
921     TPackedStrMap m_PackedStrMap;
922 };
923 
924 
925 ////////////////////////////////////////////////////////////////////
926 // e_Giim tree
927 
928 
929 class CSeq_id_Giim_Tree : public CSeq_id_Which_Tree
930 {
931 public:
932     CSeq_id_Giim_Tree(CSeq_id_Mapper* mapper);
933     ~CSeq_id_Giim_Tree(void);
934 
935     virtual bool Empty(void) const;
936 
937     virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const;
938     virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id);
939 
940     virtual void FindMatchStr(const string& sid,
941                               TSeq_id_MatchList& id_list) const;
942 
943     virtual size_t Dump(CNcbiOstream& out,
944                         CSeq_id::E_Choice type,
945                         int details) const;
946 
947 private:
948     virtual void x_Unindex(const CSeq_id_Info* info);
949     CSeq_id_Info* x_FindInfo(const CGiimport_id& gid) const;
950 
951     // 2-level indexing: first by Id, second by Db+Release
952     typedef vector<CSeq_id_Info*> TGiimList;
953     typedef map<TPacked, TGiimList> TIdMap;
954 
955     TIdMap m_IdMap;
956 };
957 
958 
959 ////////////////////////////////////////////////////////////////////
960 // e_Patent tree
961 
962 
963 class CSeq_id_Patent_Tree : public CSeq_id_Which_Tree
964 {
965 public:
966     CSeq_id_Patent_Tree(CSeq_id_Mapper* mapper);
967     ~CSeq_id_Patent_Tree(void);
968 
969     virtual bool Empty(void) const;
970 
971     virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const;
972     virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id);
973 
974     virtual void FindMatchStr(const string& sid,
975                               TSeq_id_MatchList& id_list) const;
976 
977     virtual size_t Dump(CNcbiOstream& out,
978                         CSeq_id::E_Choice type,
979                         int details) const;
980 
981 private:
982     virtual void x_Unindex(const CSeq_id_Info* info);
983     CSeq_id_Info* x_FindInfo(const CPatent_seq_id& pid) const;
984 
985     // 3-level indexing: country, (number|app_number), seqid.
986     // Ignoring patent doc-type in indexing.
987     struct SPat_idMap {
988         typedef map<TPacked, CSeq_id_Info*> TBySeqid;
989         typedef map<string, TBySeqid, PNocase> TByNumber; // or by App_number
990 
991         TByNumber m_ByNumber;
992         TByNumber m_ByApp_number;
993     };
994     typedef map<string, SPat_idMap, PNocase> TByCountry;
995 
996     TByCountry m_CountryMap;
997 };
998 
999 
1000 ////////////////////////////////////////////////////////////////////
1001 // e_PDB tree
1002 
1003 
1004 class CSeq_id_PDB_Tree : public CSeq_id_Which_Tree
1005 {
1006 public:
1007     CSeq_id_PDB_Tree(CSeq_id_Mapper* mapper);
1008     ~CSeq_id_PDB_Tree(void);
1009 
1010     virtual bool Empty(void) const;
1011 
1012     virtual CSeq_id_Handle FindInfo(const CSeq_id& id) const;
1013     virtual CSeq_id_Handle FindOrCreate(const CSeq_id& id);
1014 
1015     virtual bool HaveMatch(const CSeq_id_Handle& id) const;
1016     virtual void FindMatch(const CSeq_id_Handle& id,
1017                            TSeq_id_MatchList& id_list) const;
1018     virtual void FindMatchStr(const string& sid,
1019                               TSeq_id_MatchList& id_list) const;
1020     virtual bool HaveReverseMatch(const CSeq_id_Handle& id) const;
1021     virtual void FindReverseMatch(const CSeq_id_Handle& id,
1022                                   TSeq_id_MatchList& id_list);
1023 
1024     virtual size_t Dump(CNcbiOstream& out,
1025                         CSeq_id::E_Choice type,
1026                         int details) const;
1027 
1028 private:
1029     virtual void x_Unindex(const CSeq_id_Info* info);
1030 
1031     string x_IdToStrKey(const CPDB_seq_id& id) const;
1032 
1033     // Index by mol+chain, no date - too complicated
1034     typedef vector<CSeq_id_Info*>  TSubMolList;
1035     typedef map<string, TSubMolList, PCase> TMolMap;
1036 
1037     TMolMap m_MolMap;
1038 };
1039 
1040 
1041 // Seq-id mapper exception
1042 class NCBI_SEQ_EXPORT CSeq_id_MapperException : public CException
1043 {
1044 public:
1045     enum EErrCode {
1046         eTypeError,
1047         eSymbolError,
1048         eEmptyError,
1049         eOtherError
1050     };
1051     const char* GetErrCodeString(void) const override;
1052     NCBI_EXCEPTION_DEFAULT(CSeq_id_MapperException,CException);
1053 };
1054 
1055 
1056 /////////////////////////////////////////////////////////////////////////////
1057 //
1058 // Inline methods
1059 //
1060 /////////////////////////////////////////////////////////////////////////////
1061 
1062 END_SCOPE(objects)
1063 END_NCBI_SCOPE
1064 
1065 #endif  /* OBJECTS_OBJMGR___SEQ_ID_TREE__HPP */
1066