1 /*  $Id: wgsresolver_impl.cpp 632486 2021-06-02 11:14:23Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Eugene Vasilchenko
27  *
28  * File Description:
29  *   Resolve WGS accessions
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <sra/readers/sra/impl/wgsresolver_impl.hpp>
35 #include <sra/readers/ncbi_traces_path.hpp>
36 #include <corelib/ncbifile.hpp>
37 #include <corelib/ncbi_param.hpp>
38 #include <util/line_reader.hpp>
39 #include <sra/error_codes.hpp>
40 
41 #include <objects/seqloc/Seq_id.hpp>
42 #include <objects/general/Dbtag.hpp>
43 #include <objmgr/object_manager.hpp>
44 #include <objmgr/data_loader.hpp>
45 
46 #include <objects/id2/id2processor.hpp>
47 
48 #ifdef WGS_RESOLVER_USE_ID2_CLIENT
49 # include <objects/id2/id2__.hpp>
50 # include <objects/id2/id2_client.hpp>
51 #endif
52 
53 BEGIN_NCBI_NAMESPACE;
54 
// Error-code module used by the ERR_POST_X()/LOG_POST_X() calls in this file.
// The visible part of this file posts subcodes up to 32.
#define NCBI_USE_ERRCODE_X   WGSResolver
NCBI_DEFINE_ERR_SUBCODE_X(32);
57 
58 BEGIN_NAMESPACE(objects);
59 
60 
// Built-in accessions and file locations of the WGS resolution indexes.
// The *_ACC macros are the defaults of the matching WGS_*_ACC parameters;
// each *_PATH1/*_PATH2 pair is probed in that order by GetDirectWGSIndexPath().
#define DEFAULT_WGS_INDEX_ACC "ZZZZ99"
#define DEFAULT_WGS_INDEX2_ACC "ZZZZ98"
#define DEFAULT_WGS_INDEX3_ACC "ZZZZ97"
#define DEFAULT_WGS_INDEX_PATH1 NCBI_TRACES04_PATH "/wgs03/WGS/ZZ/ZZ/ZZZZ99"
#define DEFAULT_WGS_INDEX_PATH2 NCBI_TRACES04_PATH "/wgs03/WGS/WGS_INDEX"
#define DEFAULT_WGS_INDEX2_PATH1 NCBI_TRACES04_PATH "/wgs03/WGS/ZZ/ZZ/ZZZZ98"
#define DEFAULT_WGS_INDEX2_PATH2 NCBI_TRACES04_PATH "/wgs03/WGS/WGS_INDEX_V2"
#define DEFAULT_WGS_INDEX3_PATH1 NCBI_TRACES04_PATH "/wgs03/WGS/ZZ/ZZ/ZZZZ97"
#define DEFAULT_WGS_INDEX3_PATH2 NCBI_TRACES04_PATH "/wgs03/WGS/WGS_INDEX_V3"

// Same for the range-based index family, which is the one selected when
// s_UseWGSRangeIndex() returns true (only a main and a second index exist).
#define DEFAULT_WGS_RANGE_INDEX_ACC "ZZZZ79"
#define DEFAULT_WGS_RANGE_INDEX2_ACC "ZZZZ78"
#define DEFAULT_WGS_RANGE_INDEX_PATH1 NCBI_TRACES04_PATH "/wgs03/WGS/ZZ/ZZ/ZZZZ79"
#define DEFAULT_WGS_RANGE_INDEX_PATH2 NCBI_TRACES04_PATH "/wgs03/WGS/WGS_RANGE_INDEX_1"
#define DEFAULT_WGS_RANGE_INDEX2_PATH1 NCBI_TRACES04_PATH "/wgs03/WGS/ZZ/ZZ/ZZZZ78"
#define DEFAULT_WGS_RANGE_INDEX2_PATH2 NCBI_TRACES04_PATH "/wgs03/WGS/WGS_RANGE_INDEX_2"
77 
78 
// Allows GetDirectWGSIndexPath() to probe the built-in index file locations.
NCBI_PARAM_DECL(bool, WGS, RESOLVER_DIRECT_WGS_INDEX);
NCBI_PARAM_DEF(bool, WGS, RESOLVER_DIRECT_WGS_INDEX, true);

// Allows CWGSResolver_DL::CreateResolver(void) to create a resolver
// on top of the registered "GBLOADER" data loader.
NCBI_PARAM_DECL(bool, WGS, RESOLVER_GENBANK);
NCBI_PARAM_DEF(bool, WGS, RESOLVER_GENBANK, true);

// Selects the WGS_RANGE_INDEX* family of indexes instead of WGS_INDEX*.
NCBI_PARAM_DECL(bool, WGS, RESOLVER_WGS_RANGE_INDEX);
NCBI_PARAM_DEF(bool, WGS, RESOLVER_WGS_RANGE_INDEX, true);
87 
s_UseWGSRangeIndex(void)88 static inline bool s_UseWGSRangeIndex(void)
89 {
90     static bool value = NCBI_PARAM_TYPE(WGS, RESOLVER_WGS_RANGE_INDEX)::GetDefault();
91     return value;
92 }
93 
// Explicit location of each index (returned by GetDefaultWGSIndexPath()).
// An empty value makes the CWGSResolver_VDB constructor fall back to the
// matching *_ACC parameter and then to the built-in file locations.
NCBI_PARAM_DECL(string, WGS, WGS_INDEX);
NCBI_PARAM_DEF(string, WGS, WGS_INDEX, "");


NCBI_PARAM_DECL(string, WGS, WGS_INDEX2);
NCBI_PARAM_DEF(string, WGS, WGS_INDEX2, "");


NCBI_PARAM_DECL(string, WGS, WGS_INDEX3);
NCBI_PARAM_DEF(string, WGS, WGS_INDEX3, "");


// Accession of each index (returned by GetDefaultWGSIndexAcc()).
NCBI_PARAM_DECL(string, WGS, WGS_INDEX_ACC);
NCBI_PARAM_DEF(string, WGS, WGS_INDEX_ACC, DEFAULT_WGS_INDEX_ACC);


NCBI_PARAM_DECL(string, WGS, WGS_INDEX2_ACC);
NCBI_PARAM_DEF(string, WGS, WGS_INDEX2_ACC, DEFAULT_WGS_INDEX2_ACC);


NCBI_PARAM_DECL(string, WGS, WGS_INDEX3_ACC);
NCBI_PARAM_DEF(string, WGS, WGS_INDEX3_ACC, DEFAULT_WGS_INDEX3_ACC);


// Same two groups for the range-based index family
// (used when s_UseWGSRangeIndex() returns true).
NCBI_PARAM_DECL(string, WGS, WGS_RANGE_INDEX);
NCBI_PARAM_DEF(string, WGS, WGS_RANGE_INDEX, "");


NCBI_PARAM_DECL(string, WGS, WGS_RANGE_INDEX2);
NCBI_PARAM_DEF(string, WGS, WGS_RANGE_INDEX2, "");


NCBI_PARAM_DECL(string, WGS, WGS_RANGE_INDEX_ACC);
NCBI_PARAM_DEF(string, WGS, WGS_RANGE_INDEX_ACC, DEFAULT_WGS_RANGE_INDEX_ACC);


NCBI_PARAM_DECL(string, WGS, WGS_RANGE_INDEX2_ACC);
NCBI_PARAM_DEF(string, WGS, WGS_RANGE_INDEX2_ACC, DEFAULT_WGS_RANGE_INDEX2_ACC);
132 
133 
// debug levels
// A message is posted when the configured level (see s_DebugLevel()) is
// greater than or equal to the level attached to that message.
// The visible part of this file uses eDebug_error, eDebug_open and
// eDebug_resolve.
enum EDebugLevel {
    eDebug_none     = 0,
    eDebug_error    = 1,
    eDebug_open     = 2,
    eDebug_request  = 5,
    eDebug_replies  = 6,
    eDebug_resolve  = 7,
    eDebug_data     = 8,
    eDebug_all      = 9
};
145 
// Verbosity of resolver diagnostics, compared against EDebugLevel values.
// Defaults to eDebug_error.
NCBI_PARAM_DECL(int, WGS, DEBUG_RESOLVE);
NCBI_PARAM_DEF_EX(int, WGS, DEBUG_RESOLVE, eDebug_error,
                  eParam_NoThread, WGS_DEBUG_RESOLVE);
149 
s_DebugLevel(void)150 static inline int s_DebugLevel(void)
151 {
152     static CSafeStatic<NCBI_PARAM_TYPE(WGS, DEBUG_RESOLVE)> s_Value;
153     return s_Value->Get();
154 }
155 
156 
s_DebugEnabled(EDebugLevel level)157 static inline bool s_DebugEnabled(EDebugLevel level)
158 {
159     return s_DebugLevel() >= level;
160 }
161 
162 
// Optional ad-hoc profiling, compiled in only when COLLECT_PROFILE is defined.
// Without it PROFILE(var) expands to nothing.
//#define COLLECT_PROFILE
#ifdef COLLECT_PROFILE
// Accumulated statistics for one profiled section.
// The summary line is printed to cout from the destructor, and only if the
// section was entered at least once (name is set by the first guard).
struct SProfiler
{
    const char* name;
    size_t count;
    CStopWatch sw;
    SProfiler() : name(0), count(0) {}
    ~SProfiler() {
        if ( name )
            cout << name<<" calls: "<<count<<" time: "<<sw.Elapsed()<<endl;
    }
};
// Scope guard: counts one call and starts the profiler's stopwatch on entry,
// stops it on scope exit.
struct SProfilerGuard
{
    SProfiler& sw;
    // Note: the parameter 'sw' hides the member of the same name inside the
    // constructor body; both refer to the same SProfiler object.
    SProfilerGuard(SProfiler& sw, const char* name)
        : sw(sw)
        {
            sw.name = name;
            sw.count += 1;
            sw.sw.Start();
        }
    ~SProfilerGuard()
        {
            sw.sw.Stop();
        }
};

// Profiled sections of CWGSResolver_VDB::GetPrefixes(const string&).
static SProfiler sw_AccFind;
static SProfiler sw_AccRange;
static SProfiler sw_WGSPrefix;

// Declares a local variable named 'guard', so at most one PROFILE() may be
// used per scope, and not in a scope that already declares 'guard'.
# define PROFILE(var) SProfilerGuard guard(var, #var)
#else
# define PROFILE(var)
#endif
200 
201 /////////////////////////////////////////////////////////////////////////////
202 // CWGSResolver_VDB
203 /////////////////////////////////////////////////////////////////////////////
204 
205 
// SGiIdxTableCursor is helper accessor structure for optional GI_IDX table
// It keeps the table it was opened on (checked in Put() before the cursor
// is returned to the cache) and exposes the WGS_PREFIX column as a string.
struct CWGSResolver_VDB::SGiIdxTableCursor : public CObject {
    explicit SGiIdxTableCursor(const CVDBTable& table);

    CVDBTable m_Table;   // table the cursor belongs to
    CVDBCursor m_Cursor; // cursor opened on m_Table

    DECLARE_VDB_COLUMN_AS_STRING(WGS_PREFIX);
};
215 
216 
// Open a cursor on the given GI_IDX table and bind its WGS_PREFIX column.
CWGSResolver_VDB::SGiIdxTableCursor::SGiIdxTableCursor(const CVDBTable& table)
    : m_Table(table),
      m_Cursor(table),
      INIT_VDB_COLUMN(WGS_PREFIX)
{
}
223 
224 
// SAccIdxTableCursor is helper accessor structure for optional ACC_IDX table
// It keeps the table it was opened on (checked in Put() before the cursor
// is returned to the cache).
struct CWGSResolver_VDB::SAccIdxTableCursor : public CObject {
    explicit SAccIdxTableCursor(const CVDBTable& table);

    CVDBTable m_Table;   // table the cursor belongs to
    CVDBCursor m_Cursor; // cursor opened on m_Table

    // ACCESSION_RANGE is read as a pair [first, last] of numeric accession
    // suffixes by GetPrefixes(); it is bound as an optional column.
    typedef Uint2 acc_range_number_t;
    DECLARE_VDB_COLUMN_AS(acc_range_number_t, ACCESSION_RANGE);
    DECLARE_VDB_COLUMN_AS_STRING(WGS_PREFIX);
};
236 
237 
// Open a cursor on the given ACC_IDX table.
// ACCESSION_RANGE is bound as optional, WGS_PREFIX as a regular column.
CWGSResolver_VDB::SAccIdxTableCursor::SAccIdxTableCursor(const CVDBTable& table)
    : m_Table(table),
      m_Cursor(table),
      INIT_OPTIONAL_VDB_COLUMN(ACCESSION_RANGE),
      INIT_VDB_COLUMN(WGS_PREFIX)
{
}
245 
246 
GetDefaultWGSIndexPath(EIndexType index_type)247 string CWGSResolver_VDB::GetDefaultWGSIndexPath(EIndexType index_type)
248 {
249     if ( s_UseWGSRangeIndex() ) {
250         if ( index_type == eMainIndex ) {
251             return NCBI_PARAM_TYPE(WGS, WGS_RANGE_INDEX)::GetDefault();
252         }
253         else if ( index_type == eSecondIndex ) {
254             return NCBI_PARAM_TYPE(WGS, WGS_RANGE_INDEX2)::GetDefault();
255         }
256     }
257     else {
258         if ( index_type == eMainIndex ) {
259             return NCBI_PARAM_TYPE(WGS, WGS_INDEX)::GetDefault();
260         }
261         else if ( index_type == eSecondIndex ) {
262             return NCBI_PARAM_TYPE(WGS, WGS_INDEX2)::GetDefault();
263         }
264         else if ( index_type == eThirdIndex ) {
265             return NCBI_PARAM_TYPE(WGS, WGS_INDEX3)::GetDefault();
266         }
267     }
268     return string();
269 }
270 
271 
GetDefaultWGSIndexAcc(EIndexType index_type)272 string CWGSResolver_VDB::GetDefaultWGSIndexAcc(EIndexType index_type)
273 {
274     if ( s_UseWGSRangeIndex() ) {
275         if ( index_type == eMainIndex ) {
276             return NCBI_PARAM_TYPE(WGS, WGS_RANGE_INDEX_ACC)::GetDefault();
277         }
278         else if ( index_type == eSecondIndex ) {
279             return NCBI_PARAM_TYPE(WGS, WGS_RANGE_INDEX2_ACC)::GetDefault();
280         }
281     }
282     else {
283         if ( index_type == eMainIndex ) {
284             return NCBI_PARAM_TYPE(WGS, WGS_INDEX_ACC)::GetDefault();
285         }
286         else if ( index_type == eSecondIndex ) {
287             return NCBI_PARAM_TYPE(WGS, WGS_INDEX2_ACC)::GetDefault();
288         }
289         else if ( index_type == eThirdIndex ) {
290             return NCBI_PARAM_TYPE(WGS, WGS_INDEX3_ACC)::GetDefault();
291         }
292     }
293     return string();
294 }
295 
296 
297 static
GetDirectWGSIndexPath(CWGSResolver_VDB::EIndexType index_type)298 string GetDirectWGSIndexPath(CWGSResolver_VDB::EIndexType index_type)
299 {
300     string path;
301     if ( NCBI_PARAM_TYPE(WGS, RESOLVER_DIRECT_WGS_INDEX)::GetDefault() ) {
302         const char* path1 = 0;
303         const char* path2 = 0;
304         if ( s_UseWGSRangeIndex() ) {
305             if ( index_type == CWGSResolver_VDB::eMainIndex ) {
306                 path1 = DEFAULT_WGS_RANGE_INDEX_PATH1;
307                 path2 = DEFAULT_WGS_RANGE_INDEX_PATH2;
308             }
309             else if ( index_type == CWGSResolver_VDB::eSecondIndex ) {
310                 path1 = DEFAULT_WGS_RANGE_INDEX2_PATH1;
311                 path2 = DEFAULT_WGS_RANGE_INDEX2_PATH2;
312             }
313         }
314         else {
315             if ( index_type == CWGSResolver_VDB::eMainIndex ) {
316                 path1 = DEFAULT_WGS_INDEX_PATH1;
317                 path2 = DEFAULT_WGS_INDEX_PATH2;
318             }
319             else if ( index_type == CWGSResolver_VDB::eSecondIndex ) {
320                 path1 = DEFAULT_WGS_INDEX2_PATH1;
321                 path2 = DEFAULT_WGS_INDEX2_PATH2;
322             }
323             else if ( index_type == CWGSResolver_VDB::eThirdIndex ) {
324                 path1 = DEFAULT_WGS_INDEX3_PATH1;
325                 path2 = DEFAULT_WGS_INDEX3_PATH2;
326             }
327         }
328         if ( path1 && CDirEntry(path1).Exists() ) {
329             path = path1;
330         }
331         else if ( path2 && CDirEntry(path2).Exists() ) {
332             path = path2;
333         }
334     }
335     return path;
336 }
337 
338 
CWGSResolver_VDB(const CVDBMgr & mgr,EIndexType index_type,CWGSResolver_VDB * next_resolver)339 CWGSResolver_VDB::CWGSResolver_VDB(const CVDBMgr& mgr,
340                                    EIndexType index_type,
341                                    CWGSResolver_VDB* next_resolver)
342     : m_NextResolver(next_resolver)
343 {
344     string path = GetDefaultWGSIndexPath(index_type);
345     if ( path.empty() ) {
346         string acc = GetDefaultWGSIndexAcc(index_type);
347         // no user-defined index path, try default locations
348         // first try to open index by predefined accession, maybe remotely
349         Open(mgr, acc);
350         if ( IsValid() ) {
351             // opened
352             return;
353         }
354         // then try to open index by direct file acces, only locally
355         path = GetDirectWGSIndexPath(index_type);
356         if ( path.empty() ) {
357             // VDB index is not available
358             return;
359         }
360     }
361     if ( path.find_first_of("\\/") != NPOS && !CDirEntry(path).Exists() ) {
362         // not an accession (has directory separators) and not a file
363         if ( s_DebugEnabled(eDebug_error) ) {
364             ERR_POST_X(9, "CWGSResolver_VDB: cannot find index file: "<<path);
365         }
366         return;
367     }
368     Open(mgr, path);
369 }
370 
371 
// Create a resolver on an explicitly given index path or accession.
// 'next_resolver' is the resolver consulted when this one finds nothing.
CWGSResolver_VDB::CWGSResolver_VDB(const CVDBMgr& mgr,
                                   const string& path,
                                   CWGSResolver_VDB* next_resolver)
    : m_NextResolver(next_resolver)
{
    Open(mgr, path);
}
379 
380 
// Release the VDB objects (under the write lock, see Close()).
CWGSResolver_VDB::~CWGSResolver_VDB(void)
{
    Close();
}
385 
386 
CreateResolver(const CVDBMgr & mgr)387 CRef<CWGSResolver> CWGSResolver_VDB::CreateResolver(const CVDBMgr& mgr)
388 {
389     CRef<CWGSResolver_VDB> ret(new CWGSResolver_VDB(mgr, eMainIndex));
390     if ( !ret->IsValid() ) {
391         return null;
392     }
393     CRef<CWGSResolver_VDB> ret2(new CWGSResolver_VDB(mgr, eSecondIndex, ret));
394     if ( ret2->IsValid() ) {
395         ret = ret2;
396     }
397     if ( !ret->m_AccIndexIsPrefix ) {
398         CRef<CWGSResolver_VDB> ret3(new CWGSResolver_VDB(mgr, eThirdIndex, ret));
399         if ( ret3->IsValid() ) {
400             ret = ret3;
401         }
402     }
403     return CRef<CWGSResolver>(ret);
404 }
405 
406 
// Close the index; takes the write lock on m_DBMutex around x_Close().
void CWGSResolver_VDB::Close(void)
{
    TDBMutex::TWriteLockGuard guard(m_DBMutex);
    x_Close();
}
412 
413 
// Release all VDB objects and drop the cached cursors.
// Does no locking itself; Close() and Open() call it with the write lock held.
void CWGSResolver_VDB::x_Close()
{
    m_Mgr.Close();
    m_Db.Close();
    m_GiIdxTable.Close();
    m_AccIdxTable.Close();
    m_AccIndex.Close();
    m_GiIdxCursorCache.Clear();
    m_AccIdxCursorCache.Clear();
}
424 
425 
// Convert an index accession or path into the path to open.
// A plain VDB accession is resolved through the VDB manager; if that fails
// with CSraException the argument is used as is. Anything else (real path,
// http:, etc.) is taken verbatim. If the result exists in the file system,
// symbolic links are dereferenced to get the correct timestamp and a
// longer-living reference.
static string s_ResolveAccOrPath(const CVDBMgr& mgr, const string& acc_or_path)
{
    string resolved = acc_or_path;
    if ( CVPath::IsPlainAccession(acc_or_path) ) {
        try {
            resolved = mgr.FindAccPath(acc_or_path);
            if ( s_DebugEnabled(eDebug_open) ) {
                LOG_POST_X(28, "CWGSResolver_VDB("<<acc_or_path<<"): -> "<<resolved);
            }
        }
        catch ( CSraException& /*ignored*/ ) {
            // fall back to the original argument
            resolved = acc_or_path;
        }
    }

    CDirEntry entry(resolved);
    if ( !entry.Exists() ) {
        return resolved;
    }
    entry.DereferenceLink();
    if ( entry.GetPath() == resolved ) {
        // not a link
        return resolved;
    }
    resolved = entry.GetPath();
    if ( s_DebugEnabled(eDebug_open) ) {
        LOG_POST_X(29, "CWGSResolver_VDB("<<acc_or_path<<"): "
                   "resolved index link to "<<resolved);
    }
    return resolved;
}
460 
461 
Open(const CVDBMgr & mgr,const string & acc_or_path)462 void CWGSResolver_VDB::Open(const CVDBMgr& mgr, const string& acc_or_path)
463 {
464     string path = s_ResolveAccOrPath(mgr, acc_or_path);
465 
466     // open VDB file
467     TDBMutex::TWriteLockGuard guard(m_DBMutex);
468     x_Close();
469     m_Mgr = mgr;
470     try {
471         m_Db = CVDB(m_Mgr, path);
472     }
473     catch ( CSraException& exc ) {
474         if ( exc.GetErrCode() == exc.eNotFoundDb ) {
475             return;
476         }
477         throw;
478     }
479 
480     // save original argument for possible changes in symbolic links
481     m_WGSIndexPath = acc_or_path;
482     m_WGSIndexResolvedPath = path;
483     if ( !CDirEntry(path).GetTime(&m_Timestamp) ) {
484         m_Timestamp = CTime();
485     }
486     else {
487         if ( s_DebugEnabled(eDebug_open) ) {
488             LOG_POST_X(30, "CWGSResolver_VDB("<<acc_or_path<<"): index timestamp: "<<m_Timestamp);
489         }
490     }
491     m_GiIdxTable = CVDBTable(m_Db, "GI_IDX");
492     m_AccIdxTable = CVDBTable(m_Db, "ACC_IDX");
493     m_AccIndexIsPrefix = true;
494     m_AccIndex = CVDBTableIndex(m_AccIdxTable, "accession_prefix", CVDBTableIndex::eMissing_Allow);
495     if ( !m_AccIndex ) {
496         m_AccIndexIsPrefix = false;
497         m_AccIndex = CVDBTableIndex(m_AccIdxTable, "accession");
498     }
499 }
500 
501 
Reopen(void)502 void CWGSResolver_VDB::Reopen(void)
503 {
504     if ( CVDBMgr mgr = m_Mgr ) {
505         string path = GetWGSIndexPath();
506         Open(mgr, path);
507     }
508 }
509 
510 
Update(void)511 bool CWGSResolver_VDB::Update(void)
512 {
513     bool ret = x_Update();
514     if ( m_NextResolver && m_NextResolver->Update() ) {
515         ret = true;
516     }
517     return ret;
518 }
519 
520 
x_Update(void)521 bool CWGSResolver_VDB::x_Update(void)
522 {
523     string path = s_ResolveAccOrPath(m_Mgr, GetWGSIndexPath());
524     if ( path != GetWGSIndexResolvedPath() ) {
525         // resolved to a different path -> new index by symbolic link
526         LOG_POST_X(32, "CWGSResolver_VDB: new index path: "<<path);
527         Reopen();
528         return true;
529     }
530 
531     CTime timestamp;
532     if ( !CDirEntry(path).GetTime(&timestamp) ) {
533         // cannot get timestamp -> remote reference
534         return false;
535     }
536     if ( timestamp == m_Timestamp ) {
537         // same timestamp
538         return false;
539     }
540     if ( s_DebugEnabled(eDebug_open) ) {
541         LOG_POST_X(31, "CWGSResolver_VDB: new index timestamp: "<<timestamp);
542     }
543     Reopen();
544     return true;
545 }
546 
547 
548 inline
GiIdx(TIntId row)549 CRef<CWGSResolver_VDB::SGiIdxTableCursor> CWGSResolver_VDB::GiIdx(TIntId row)
550 {
551     CRef<SGiIdxTableCursor> curs = m_GiIdxCursorCache.Get(row);
552     if ( !curs ) {
553         curs = new SGiIdxTableCursor(GiIdxTable());
554     }
555     return curs;
556 }
557 
558 
559 inline
AccIdx(void)560 CRef<CWGSResolver_VDB::SAccIdxTableCursor> CWGSResolver_VDB::AccIdx(void)
561 {
562     CRef<SAccIdxTableCursor> curs = m_AccIdxCursorCache.Get();
563     if ( !curs ) {
564         curs = new SAccIdxTableCursor(AccIdxTable());
565     }
566     return curs;
567 }
568 
569 
570 inline
Put(CRef<SGiIdxTableCursor> & curs,TIntId row)571 void CWGSResolver_VDB::Put(CRef<SGiIdxTableCursor>& curs, TIntId row)
572 {
573     if ( curs->m_Table == GiIdxTable() ) {
574         m_GiIdxCursorCache.Put(curs, row);
575     }
576 }
577 
578 
579 inline
Put(CRef<SAccIdxTableCursor> & curs)580 void CWGSResolver_VDB::Put(CRef<SAccIdxTableCursor>& curs)
581 {
582     if ( curs->m_Table == AccIdxTable() ) {
583         m_AccIdxCursorCache.Put(curs);
584     }
585 }
586 
587 
GetPrefixes(TGi gi)588 CWGSResolver::TWGSPrefixes CWGSResolver_VDB::GetPrefixes(TGi gi)
589 {
590     TDBMutex::TReadLockGuard guard(m_DBMutex);
591     TWGSPrefixes ret;
592     if ( s_DebugEnabled(eDebug_resolve) ) {
593         LOG_POST_X(24, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): Resolving "<<gi);
594     }
595     CRef<SGiIdxTableCursor> cur = GiIdx();
596     CVDBStringValue value = cur->WGS_PREFIX(GI_TO(TVDBRowId, gi), CVDBValue::eMissing_Allow);
597     if ( !value.empty() ) {
598         if ( s_DebugEnabled(eDebug_resolve) ) {
599             LOG_POST_X(25, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): WGS prefix "<<*value);
600         }
601         ret.push_back(*value);
602     }
603     Put(cur);
604     if ( ret.empty() && m_NextResolver ) {
605         ret = m_NextResolver->GetPrefixes(gi);
606     }
607     return ret;
608 }
609 
610 
// Split an accession into its leading part and the number formed by its
// last four characters.
// On success the four trailing decimal digits are removed from 'uacc', their
// value (0..9999) is stored in 'key_num', and true is returned.
// Returns false, leaving both arguments untouched, if the accession has four
// or fewer characters or if any of its last four characters is not a digit.
static inline bool s_SplitAccIndex(string& uacc, Uint2& key_num)
{
    static const size_t kKeyDigits = 4;
    const size_t total_len = uacc.size();
    if ( total_len <= kKeyDigits ) {
        // nothing would be left for the leading part
        return false;
    }
    const size_t digits_pos = total_len - kKeyDigits;
    unsigned number = 0;
    for ( size_t pos = digits_pos; pos < total_len; ++pos ) {
        const char c = uacc[pos];
        if ( !(c >= '0' && c <= '9') ) {
            return false;
        }
        number = number*10 + (c-'0');
    }
    key_num = number;
    uacc.erase(digits_pos);
    return true;
}
630 
631 
// Resolve an accession into WGS prefixes using the ACC_IDX table.
// With a prefix index ("accession_prefix") the last four digits of the
// accession are split off; the remaining part is the index key and the number
// is matched against the ACCESSION_RANGE of each row found. With a plain
// index the whole accession is the key and every row found is accepted.
// If this index yields nothing, the next resolver in the chain (if any) is
// asked with the original accession. The read lock is held for the whole call.
CWGSResolver::TWGSPrefixes CWGSResolver_VDB::GetPrefixes(const string& acc)
{
    TDBMutex::TReadLockGuard guard(m_DBMutex);
    TWGSPrefixes ret;
    if ( s_DebugEnabled(eDebug_resolve) ) {
        LOG_POST_X(26, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): Resolving "<<acc);
    }
    string uacc = acc;
    SAccIdxTableCursor::acc_range_number_t key_num = 0;
    if ( m_AccIndexIsPrefix ) {
        if ( !s_SplitAccIndex(uacc, key_num) ) {
            if ( s_DebugEnabled(eDebug_resolve) ) {
                LOG_POST_X(27, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): invalid accession");
            }
            // note: the next resolver is not consulted in this case
            return ret;
        }
    }
    // the index is searched with an upper-cased key
    NStr::ToUpper(uacc);
    TVDBRowIdRange range;
    {{
        // separate scope: PROFILE() declares its own 'guard' variable
        PROFILE(sw_AccFind);
        range = m_AccIndex.Find(uacc);
    }}
    if ( s_DebugEnabled(eDebug_resolve) ) {
        LOG_POST_X(27, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): "
                   "range "<<range.first<<"-"<<range.second);
    }
    // range.first is the first row id, range.second is the number of rows
    if ( range.second ) {
        CRef<SAccIdxTableCursor> cur = AccIdx();
        for ( TVDBRowCount i = 0; i < range.second; ++i ) {
            TVDBRowId row_id = range.first+i;
            if ( m_AccIndexIsPrefix ) {
                PROFILE(sw_AccRange);
                // v[0] and v[1] are the first and the last number of the range
                CVDBValueFor<SAccIdxTableCursor::acc_range_number_t> v =
                    cur->ACCESSION_RANGE(row_id);
                if ( v[0] > key_num ) {
                    // current range is past the requested id, end of scan
                    break;
                }
                if ( v[1] < key_num ) {
                    // current range is before the requested id, check next range
                    continue;
                }
            }
            PROFILE(sw_WGSPrefix);
            CTempString prefix = *cur->WGS_PREFIX(row_id);
            if ( s_DebugEnabled(eDebug_resolve) ) {
                LOG_POST_X(27, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): WGS prefix "<<prefix);
            }
            ret.push_back(prefix);
        }
        Put(cur);
    }
    if ( ret.empty() && m_NextResolver ) {
        ret = m_NextResolver->GetPrefixes(acc);
    }
    return ret;
}
690 
691 
692 /////////////////////////////////////////////////////////////////////////////
693 // CWGSResolver_Ids
694 /////////////////////////////////////////////////////////////////////////////
695 
696 
// Nothing to initialize at this level.
CWGSResolver_Ids::CWGSResolver_Ids(void)
{
}
700 
701 
// Nothing to release at this level.
CWGSResolver_Ids::~CWGSResolver_Ids(void)
{
}
705 
706 
ParseWGSPrefix(const CDbtag & dbtag) const707 string CWGSResolver_Ids::ParseWGSPrefix(const CDbtag& dbtag) const
708 {
709     const string& db = dbtag.GetDb();
710     if ( (db.size() != 8 && db.size() != 10) ||
711          !NStr::StartsWith(db, "WGS:") ) {
712         return string();
713     }
714     string prefix = db.substr(4);
715     if ( prefix.size() == 4 ) {
716         prefix += "01";
717     }
718     _ASSERT(prefix.size() == 6);
719     for ( size_t i = 0; i < 4; ++i ) {
720         if ( !isupper(Uint1(prefix[i])) ) {
721             return string();
722         }
723     }
724     for ( size_t i = 4; i < 6; ++i ) {
725         if ( !isdigit(Uint1(prefix[i])) ) {
726             return string();
727         }
728     }
729     return prefix;
730 }
731 
732 
// Layout of a WGS accession as checked by CWGSResolver_Ids::ParseWGSAcc():
// kNumLetters letters, kVersionDigits digits, an optional type letter,
// and then the row digits.
static const size_t kNumLetters = 4;
static const size_t kVersionDigits = 2;
static const size_t kPrefixLen = kNumLetters + kVersionDigits;
static const size_t kMinRowDigits = 6;
static const size_t kMaxRowDigits = 8;
738 
739 
// Check that 'acc' has the WGS accession format and return its WGS prefix.
// Expected format: kNumLetters letters, kVersionDigits digits, an optional
// type letter, and kMinRowDigits..kMaxRowDigits row digits.
// Type letter:
//   'S' - accepted only when 'protein' is false,
//   'P' - accepted only when 'protein' is true,
//   none - contig or master sequence, accepted only when 'protein' is false.
// Returns the first kPrefixLen characters of 'acc' (letter case is kept),
// or an empty string if 'acc' does not match.
string CWGSResolver_Ids::ParseWGSAcc(const string& acc, bool protein) const
{
    // coarse length check, also guarantees that acc[kPrefixLen] exists
    if ( acc.size() < kPrefixLen + kMinRowDigits ||
         acc.size() > kPrefixLen + kMaxRowDigits + 1 ) { // one for type letter
        return string();
    }
    for ( size_t i = 0; i < kNumLetters; ++i ) {
        if ( !isalpha(acc[i]&0xff) ) {
            return string();
        }
    }
    for ( size_t i = kNumLetters; i < kPrefixLen; ++i ) {
        if ( !isdigit(acc[i]&0xff) ) {
            return string();
        }
    }
    SIZE_TYPE row_pos = kPrefixLen;
    switch ( acc[row_pos] ) { // optional type letter
    case 'S':
        if ( protein ) {
            return string();
        }
        ++row_pos;
        break;
    case 'P':
        if ( !protein ) {
            return string();
        }
        ++row_pos;
        break;
    default:
        // it can be either contig or master sequence
        if ( protein ) {
            return string();
        }
        break;
    }
    // The coarse check above cannot tell whether a type letter is present:
    // it lets through a type letter followed by kMinRowDigits-1 digits and
    // kMaxRowDigits+1 digits without a type letter.
    // Check the exact number of row digits now that row_pos is known.
    const size_t row_digits = acc.size() - row_pos;
    if ( row_digits < kMinRowDigits || row_digits > kMaxRowDigits ) {
        return string();
    }
    for ( size_t i = row_pos; i < acc.size(); ++i ) {
        char c = acc[i];
        if ( c < '0' || c > '9' ) {
            return string();
        }
    }
    return acc.substr(0, kPrefixLen);
}
785 
786 
ParseWGSPrefix(const CTextseq_id & text_id) const787 string CWGSResolver_Ids::ParseWGSPrefix(const CTextseq_id& text_id) const
788 {
789     if ( text_id.IsSetName() ) {
790         // first try name reference if it has WGS format like AAAA01P000001
791         // as it directly contains WGS accession
792         string wgs_acc = ParseWGSAcc(text_id.GetName(), true);
793         if ( !wgs_acc.empty() ) {
794             return wgs_acc;
795         }
796     }
797     if ( text_id.IsSetAccession() ) {
798         const string& acc = text_id.GetAccession();
799         CSeq_id::EAccessionInfo type = CSeq_id::IdentifyAccession(acc);
800         if ( !(type & CSeq_id::fAcc_prot) ) {
801             switch ( type & CSeq_id::eAcc_division_mask ) {
802                 // accepted accession types
803             case CSeq_id::eAcc_wgs:
804             case CSeq_id::eAcc_wgs_intermed:
805             case CSeq_id::eAcc_tsa:
806                 return ParseWGSAcc(acc, false);
807             default:
808                 break;
809             }
810         }
811     }
812     return string();
813 }
814 
815 
ParseWGSPrefix(const CSeq_id & id) const816 string CWGSResolver_Ids::ParseWGSPrefix(const CSeq_id& id) const
817 {
818     if ( id.IsGeneral() ) {
819         return ParseWGSPrefix(id.GetGeneral());
820     }
821     else if ( const CTextseq_id* text_id = id.GetTextseq_Id() ) {
822         return ParseWGSPrefix(*text_id);
823     }
824     return string();
825 }
826 
827 
// Resolve a GI: wrap it into a Seq-id and use the Seq-id based overload.
CWGSResolver::TWGSPrefixes CWGSResolver_Ids::GetPrefixes(TGi gi)
{
    CSeq_id seq_id;
    seq_id.SetGi(gi);
    return GetPrefixes(seq_id);
}
834 
835 
// Resolve an accession: construct a Seq-id from the string and use the
// Seq-id based overload.
CWGSResolver::TWGSPrefixes CWGSResolver_Ids::GetPrefixes(const string& acc)
{
    CSeq_id seq_id(acc);
    return GetPrefixes(seq_id);
}
841 
842 
843 /////////////////////////////////////////////////////////////////////////////
844 // CWGSResolver_DL
845 /////////////////////////////////////////////////////////////////////////////
846 
847 
// Use the data loader registered in the object manager as "GBLOADER".
// CreateResolver(void) checks IsValid() on the object built this way.
CWGSResolver_DL::CWGSResolver_DL(void)
    : m_Loader(CObjectManager::GetInstance()->FindDataLoader("GBLOADER"))
{

}
853 
854 
// Use the given data loader for id resolution.
CWGSResolver_DL::CWGSResolver_DL(CDataLoader* loader)
    : m_Loader(loader)
{
}
859 
860 
// Nothing to release explicitly.
CWGSResolver_DL::~CWGSResolver_DL(void)
{
}
864 
865 
866 CRef<CWGSResolver>
CreateResolver(CDataLoader * loader)867 CWGSResolver_DL::CreateResolver(CDataLoader* loader)
868 {
869     if ( !loader ) {
870         return null;
871     }
872     return CRef<CWGSResolver>(new CWGSResolver_DL(loader));
873 }
874 
875 
876 CRef<CWGSResolver>
CreateResolver(void)877 CWGSResolver_DL::CreateResolver(void)
878 {
879     if ( !NCBI_PARAM_TYPE(WGS, RESOLVER_GENBANK)::GetDefault() ) {
880         return null;
881     }
882     CRef<CWGSResolver_DL> resolver(new CWGSResolver_DL());
883     if ( !resolver->IsValid() ) {
884         return null;
885     }
886     return CRef<CWGSResolver>(resolver);
887 }
888 
889 
GetPrefixes(const CSeq_id & id)890 CWGSResolver::TWGSPrefixes CWGSResolver_DL::GetPrefixes(const CSeq_id& id)
891 {
892     TWGSPrefixes prefixes;
893     if ( s_DebugEnabled(eDebug_resolve) ) {
894         LOG_POST_X(10, "CWGSResolver_DL: "
895                    "Asking DataLoader for ids of "<<id.AsFastaString());
896     }
897     CDataLoader::TIds ids;
898     m_Loader->GetIds(CSeq_id_Handle::GetHandle(id), ids);
899     ITERATE ( CDataLoader::TIds, rit, ids ) {
900         if ( s_DebugEnabled(eDebug_resolve) ) {
901             LOG_POST_X(11, "CWGSResolver_DL: Parsing Seq-id "<<*rit);
902         }
903         string prefix = ParseWGSPrefix(*rit->GetSeqId());
904         if ( !prefix.empty() ) {
905             if ( s_DebugEnabled(eDebug_resolve) ) {
906                 LOG_POST_X(12, "CWGSResolver_DL: WGS prefix: "<<prefix);
907             }
908             prefixes.push_back(prefix);
909             break;
910         }
911     }
912     return prefixes;
913 }
914 
915 
916 /////////////////////////////////////////////////////////////////////////////
917 // CWGSResolver_Proc
918 /////////////////////////////////////////////////////////////////////////////
919 
920 
// Use the given ID2 processor resolver for id resolution.
CWGSResolver_Proc::CWGSResolver_Proc(CID2ProcessorResolver* resolver)
    : m_Resolver(resolver)
{
}
925 
926 
// Nothing to release explicitly.
CWGSResolver_Proc::~CWGSResolver_Proc(void)
{
}
930 
931 
932 CRef<CWGSResolver>
CreateResolver(CID2ProcessorResolver * resolver)933 CWGSResolver_Proc::CreateResolver(CID2ProcessorResolver* resolver)
934 {
935     if ( !resolver ) {
936         return null;
937     }
938     return CRef<CWGSResolver>(new CWGSResolver_Proc(resolver));
939 }
940 
941 
GetPrefixes(const CSeq_id & id)942 CWGSResolver::TWGSPrefixes CWGSResolver_Proc::GetPrefixes(const CSeq_id& id)
943 {
944     TWGSPrefixes prefixes;
945     if ( s_DebugEnabled(eDebug_resolve) ) {
946         LOG_POST_X(13, "CWGSResolver_Proc: "
947                    "Asking GB for ids of "<<id.AsFastaString());
948     }
949     CID2ProcessorResolver::TIds ids = m_Resolver->GetIds(id);
950     ITERATE ( CID2ProcessorResolver::TIds, rit, ids ) {
951         if ( s_DebugEnabled(eDebug_resolve) ) {
952             LOG_POST_X(14, "CWGSResolver_Proc: "
953                        "Parsing Seq-id "<<(*rit)->AsFastaString());
954         }
955         string prefix = ParseWGSPrefix(**rit);
956         if ( !prefix.empty() ) {
957             if ( s_DebugEnabled(eDebug_resolve) ) {
958                 LOG_POST_X(15, "CWGSResolver_Proc: WGS prefix: "<<prefix);
959             }
960             prefixes.push_back(prefix);
961             break;
962         }
963     }
964     return prefixes;
965 }
966 
967 
968 #ifdef WGS_RESOLVER_USE_ID2_CLIENT
969 
970 /////////////////////////////////////////////////////////////////////////////
971 // CWGSResolver_ID2
972 /////////////////////////////////////////////////////////////////////////////
973 
974 
CWGSResolver_ID2(void)975 CWGSResolver_ID2::CWGSResolver_ID2(void)
976     : m_ID2Client(new CID2Client())
977 {
978 }
979 
980 
~CWGSResolver_ID2(void)981 CWGSResolver_ID2::~CWGSResolver_ID2(void)
982 {
983 }
984 
985 
986 CRef<CWGSResolver>
CreateResolver(void)987 CWGSResolver_ID2::CreateResolver(void)
988 {
989     CRef<CWGSResolver_ID2> resolver(new CWGSResolver_ID2);
990     if ( !resolver->IsValid() ) {
991         return null;
992     }
993     return CRef<CWGSResolver>(resolver);
994 }
995 
996 
ParseWGSPrefix(const CID2_Reply & reply) const997 string CWGSResolver_ID2::ParseWGSPrefix(const CID2_Reply& reply) const
998 {
999     if ( !reply.GetReply().IsGet_seq_id() ) {
1000         return string();
1001     }
1002     const CID2_Reply_Get_Seq_id& reply_id = reply.GetReply().GetGet_seq_id();
1003     if ( !reply_id.IsSetSeq_id() ) {
1004         return string();
1005     }
1006     const CID2_Reply_Get_Seq_id::TSeq_id& ids = reply_id.GetSeq_id();
1007     ITERATE ( CID2_Reply_Get_Seq_id::TSeq_id, it, ids ) {
1008         string prefix = CWGSResolver_Ids::ParseWGSPrefix(**it);
1009         if ( !prefix.empty() ) {
1010             return prefix;
1011         }
1012     }
1013     return string();
1014 }
1015 
1016 
Update(void)1017 bool CWGSResolver_ID2::Update(void)
1018 {
1019     CMutexGuard guard(m_Mutex);
1020     bool ret = !m_Cache.empty();
1021     m_Cache.clear();
1022     return ret;
1023 }
1024 
1025 
GetPrefixes(const CSeq_id & id)1026 CWGSResolver::TWGSPrefixes CWGSResolver_ID2::GetPrefixes(const CSeq_id& id)
1027 {
1028     TWGSPrefixes prefixes;
1029     CMutexGuard guard(m_Mutex);
1030     string id_str = id.AsFastaString();
1031     TCache::const_iterator iter = m_Cache.find(id_str);
1032     if ( iter != m_Cache.end() ) {
1033         if ( !iter->second.empty() ) {
1034             prefixes.push_back(iter->second);
1035         }
1036         return prefixes;
1037     }
1038     CID2_Request_Get_Seq_id req;
1039     req.SetSeq_id().SetSeq_id(const_cast<CSeq_id&>(id));
1040     req.SetSeq_id_type(req.eSeq_id_type_general);
1041     if ( s_DebugEnabled(eDebug_resolve) ) {
1042         LOG_POST_X(16, "CWGSResolver_ID2: "
1043                    "Asking ID2 for ids of "<<id.AsFastaString());
1044     }
1045     m_ID2Client->AskGet_seq_id(req);
1046     const CID2Client::TReplies& replies = m_ID2Client->GetAllReplies();
1047     ITERATE ( CID2Client::TReplies, rit, replies ) {
1048         if ( s_DebugEnabled(eDebug_resolve) ) {
1049             LOG_POST_X(17, "CWGSResolver_ID2: "
1050                        "Parsing ID2 reply "<<MSerial_AsnText<<**rit);
1051         }
1052         string prefix = ParseWGSPrefix(**rit);
1053         if ( !prefix.empty() ) {
1054             if ( s_DebugEnabled(eDebug_resolve) ) {
1055                 LOG_POST_X(18, "CWGSResolver_ID2: WGS prefix: "<<prefix);
1056             }
1057             prefixes.push_back(prefix);
1058             break;
1059         }
1060     }
1061     string& save = m_Cache[id_str];
1062     if ( !prefixes.empty() ) {
1063         save = prefixes[0];
1064     }
1065     return prefixes;
1066 }
1067 
1068 #endif //WGS_RESOLVER_USE_ID2_CLIENT
1069 
1070 
1071 END_NAMESPACE(objects);
1072 END_NCBI_NAMESPACE;
1073