1 /* $Id: wgsresolver_impl.cpp 632486 2021-06-02 11:14:23Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Eugene Vasilchenko
27 *
28 * File Description:
29 * Resolve WGS accessions
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <sra/readers/sra/impl/wgsresolver_impl.hpp>
35 #include <sra/readers/ncbi_traces_path.hpp>
36 #include <corelib/ncbifile.hpp>
37 #include <corelib/ncbi_param.hpp>
38 #include <util/line_reader.hpp>
39 #include <sra/error_codes.hpp>
40
41 #include <objects/seqloc/Seq_id.hpp>
42 #include <objects/general/Dbtag.hpp>
43 #include <objmgr/object_manager.hpp>
44 #include <objmgr/data_loader.hpp>
45
46 #include <objects/id2/id2processor.hpp>
47
48 #ifdef WGS_RESOLVER_USE_ID2_CLIENT
49 # include <objects/id2/id2__.hpp>
50 # include <objects/id2/id2_client.hpp>
51 #endif
52
53 BEGIN_NCBI_NAMESPACE;
54
55 #define NCBI_USE_ERRCODE_X WGSResolver
56 NCBI_DEFINE_ERR_SUBCODE_X(32);
57
58 BEGIN_NAMESPACE(objects);
59
60
// Built-in defaults for the plain WGS index tables.
// The *_ACC values are the defaults of the WGS_INDEX*_ACC parameters below;
// the *_PATH1 / *_PATH2 pairs are the two file locations that
// GetDirectWGSIndexPath() checks, in that order, for each index.
61 #define DEFAULT_WGS_INDEX_ACC "ZZZZ99"
62 #define DEFAULT_WGS_INDEX2_ACC "ZZZZ98"
63 #define DEFAULT_WGS_INDEX3_ACC "ZZZZ97"
64 #define DEFAULT_WGS_INDEX_PATH1 NCBI_TRACES04_PATH "/wgs03/WGS/ZZ/ZZ/ZZZZ99"
65 #define DEFAULT_WGS_INDEX_PATH2 NCBI_TRACES04_PATH "/wgs03/WGS/WGS_INDEX"
66 #define DEFAULT_WGS_INDEX2_PATH1 NCBI_TRACES04_PATH "/wgs03/WGS/ZZ/ZZ/ZZZZ98"
67 #define DEFAULT_WGS_INDEX2_PATH2 NCBI_TRACES04_PATH "/wgs03/WGS/WGS_INDEX_V2"
68 #define DEFAULT_WGS_INDEX3_PATH1 NCBI_TRACES04_PATH "/wgs03/WGS/ZZ/ZZ/ZZZZ97"
69 #define DEFAULT_WGS_INDEX3_PATH2 NCBI_TRACES04_PATH "/wgs03/WGS/WGS_INDEX_V3"
70
// Same defaults for the WGS range index tables, which are the ones used
// when s_UseWGSRangeIndex() returns true (main and second index only).
71 #define DEFAULT_WGS_RANGE_INDEX_ACC "ZZZZ79"
72 #define DEFAULT_WGS_RANGE_INDEX2_ACC "ZZZZ78"
73 #define DEFAULT_WGS_RANGE_INDEX_PATH1 NCBI_TRACES04_PATH "/wgs03/WGS/ZZ/ZZ/ZZZZ79"
74 #define DEFAULT_WGS_RANGE_INDEX_PATH2 NCBI_TRACES04_PATH "/wgs03/WGS/WGS_RANGE_INDEX_1"
75 #define DEFAULT_WGS_RANGE_INDEX2_PATH1 NCBI_TRACES04_PATH "/wgs03/WGS/ZZ/ZZ/ZZZZ78"
76 #define DEFAULT_WGS_RANGE_INDEX2_PATH2 NCBI_TRACES04_PATH "/wgs03/WGS/WGS_RANGE_INDEX_2"
77
78
// [WGS] RESOLVER_DIRECT_WGS_INDEX (default true): when false,
// GetDirectWGSIndexPath() does not look for index files at the built-in
// file locations and returns an empty path.
79 NCBI_PARAM_DECL(bool, WGS, RESOLVER_DIRECT_WGS_INDEX);
80 NCBI_PARAM_DEF(bool, WGS, RESOLVER_DIRECT_WGS_INDEX, true);
81
// [WGS] RESOLVER_GENBANK (default true): when false,
// CWGSResolver_DL::CreateResolver(void) returns null.
82 NCBI_PARAM_DECL(bool, WGS, RESOLVER_GENBANK);
83 NCBI_PARAM_DEF(bool, WGS, RESOLVER_GENBANK, true);
84
// [WGS] RESOLVER_WGS_RANGE_INDEX (default true): read by
// s_UseWGSRangeIndex() to choose between the range index and the plain index.
85 NCBI_PARAM_DECL(bool, WGS, RESOLVER_WGS_RANGE_INDEX);
86 NCBI_PARAM_DEF(bool, WGS, RESOLVER_WGS_RANGE_INDEX, true);
87
s_UseWGSRangeIndex(void)88 static inline bool s_UseWGSRangeIndex(void)
89 {
90 static bool value = NCBI_PARAM_TYPE(WGS, RESOLVER_WGS_RANGE_INDEX)::GetDefault();
91 return value;
92 }
93
// User-configurable locations of the plain WGS index tables.
// GetDefaultWGSIndexPath() returns these values; they are empty by default.
94 NCBI_PARAM_DECL(string, WGS, WGS_INDEX);
95 NCBI_PARAM_DEF(string, WGS, WGS_INDEX, "");
96
97
98 NCBI_PARAM_DECL(string, WGS, WGS_INDEX2);
99 NCBI_PARAM_DEF(string, WGS, WGS_INDEX2, "");
100
101
102 NCBI_PARAM_DECL(string, WGS, WGS_INDEX3);
103 NCBI_PARAM_DEF(string, WGS, WGS_INDEX3, "");
104
105
// Accessions of the plain WGS index tables.
// GetDefaultWGSIndexAcc() returns these values.
106 NCBI_PARAM_DECL(string, WGS, WGS_INDEX_ACC);
107 NCBI_PARAM_DEF(string, WGS, WGS_INDEX_ACC, DEFAULT_WGS_INDEX_ACC);
108
109
110 NCBI_PARAM_DECL(string, WGS, WGS_INDEX2_ACC);
111 NCBI_PARAM_DEF(string, WGS, WGS_INDEX2_ACC, DEFAULT_WGS_INDEX2_ACC);
112
113
114 NCBI_PARAM_DECL(string, WGS, WGS_INDEX3_ACC);
115 NCBI_PARAM_DEF(string, WGS, WGS_INDEX3_ACC, DEFAULT_WGS_INDEX3_ACC);
116
117
// User-configurable locations of the WGS range index tables
// (returned by GetDefaultWGSIndexPath() when the range index is in use).
118 NCBI_PARAM_DECL(string, WGS, WGS_RANGE_INDEX);
119 NCBI_PARAM_DEF(string, WGS, WGS_RANGE_INDEX, "");
120
121
122 NCBI_PARAM_DECL(string, WGS, WGS_RANGE_INDEX2);
123 NCBI_PARAM_DEF(string, WGS, WGS_RANGE_INDEX2, "");
124
125
// Accessions of the WGS range index tables
// (returned by GetDefaultWGSIndexAcc() when the range index is in use).
126 NCBI_PARAM_DECL(string, WGS, WGS_RANGE_INDEX_ACC);
127 NCBI_PARAM_DEF(string, WGS, WGS_RANGE_INDEX_ACC, DEFAULT_WGS_RANGE_INDEX_ACC);
128
129
130 NCBI_PARAM_DECL(string, WGS, WGS_RANGE_INDEX2_ACC);
131 NCBI_PARAM_DEF(string, WGS, WGS_RANGE_INDEX2_ACC, DEFAULT_WGS_RANGE_INDEX2_ACC);
132
133
// Thresholds compared against the [WGS] DEBUG_RESOLVE parameter by
// s_DebugEnabled(): a message guarded by a level is posted when the
// configured value is greater than or equal to that level.
// In this file eDebug_error guards the "cannot find index file" error,
// eDebug_open guards messages about opening/reopening the index, and
// eDebug_resolve guards per-request resolution traces.
134 // debug levels
135 enum EDebugLevel {
136 eDebug_none = 0,
137 eDebug_error = 1,
138 eDebug_open = 2,
139 eDebug_request = 5,
140 eDebug_replies = 6,
141 eDebug_resolve = 7,
142 eDebug_data = 8,
143 eDebug_all = 9
144 };
145
// Debug level parameter; defaults to eDebug_error.
146 NCBI_PARAM_DECL(int, WGS, DEBUG_RESOLVE);
147 NCBI_PARAM_DEF_EX(int, WGS, DEBUG_RESOLVE, eDebug_error,
148 eParam_NoThread, WGS_DEBUG_RESOLVE);
149
s_DebugLevel(void)150 static inline int s_DebugLevel(void)
151 {
152 static CSafeStatic<NCBI_PARAM_TYPE(WGS, DEBUG_RESOLVE)> s_Value;
153 return s_Value->Get();
154 }
155
156
s_DebugEnabled(EDebugLevel level)157 static inline bool s_DebugEnabled(EDebugLevel level)
158 {
159 return s_DebugLevel() >= level;
160 }
161
162
163 //#define COLLECT_PROFILE
164 #ifdef COLLECT_PROFILE
165 struct SProfiler
166 {
167 const char* name;
168 size_t count;
169 CStopWatch sw;
SProfilerSProfiler170 SProfiler() : name(0), count(0) {}
~SProfilerSProfiler171 ~SProfiler() {
172 if ( name )
173 cout << name<<" calls: "<<count<<" time: "<<sw.Elapsed()<<endl;
174 }
175 };
176 struct SProfilerGuard
177 {
178 SProfiler& sw;
SProfilerGuardSProfilerGuard179 SProfilerGuard(SProfiler& sw, const char* name)
180 : sw(sw)
181 {
182 sw.name = name;
183 sw.count += 1;
184 sw.sw.Start();
185 }
~SProfilerGuardSProfilerGuard186 ~SProfilerGuard()
187 {
188 sw.sw.Stop();
189 }
190 };
191
// Profiling accumulators, one per section measured in
// CWGSResolver_VDB::GetPrefixes(const string&).
192 static SProfiler sw_AccFind;
193 static SProfiler sw_AccRange;
194 static SProfiler sw_WGSPrefix;
195
// PROFILE(var) declares a local SProfilerGuard named 'guard' that measures
// the rest of the enclosing scope; it expands to nothing when
// COLLECT_PROFILE is not defined.
196 # define PROFILE(var) SProfilerGuard guard(var, #var)
197 #else
198 # define PROFILE(var)
199 #endif
200
201 /////////////////////////////////////////////////////////////////////////////
202 // CWGSResolver_VDB
203 /////////////////////////////////////////////////////////////////////////////
204
205
206 // SGiIdxTableCursor is helper accessor structure for optional GI_IDX table
207 struct CWGSResolver_VDB::SGiIdxTableCursor : public CObject {
208 explicit SGiIdxTableCursor(const CVDBTable& table);
209
210 CVDBTable m_Table;
211 CVDBCursor m_Cursor;
212
213 DECLARE_VDB_COLUMN_AS_STRING(WGS_PREFIX);
214 };
215
216
SGiIdxTableCursor(const CVDBTable & table)217 CWGSResolver_VDB::SGiIdxTableCursor::SGiIdxTableCursor(const CVDBTable& table)
218 : m_Table(table),
219 m_Cursor(table),
220 INIT_VDB_COLUMN(WGS_PREFIX)
221 {
222 }
223
224
225 // SAccIdxTableCursor is helper accessor structure for optional ACC_IDX table
226 struct CWGSResolver_VDB::SAccIdxTableCursor : public CObject {
227 explicit SAccIdxTableCursor(const CVDBTable& table);
228
229 CVDBTable m_Table;
230 CVDBCursor m_Cursor;
231
232 typedef Uint2 acc_range_number_t;
233 DECLARE_VDB_COLUMN_AS(acc_range_number_t, ACCESSION_RANGE);
234 DECLARE_VDB_COLUMN_AS_STRING(WGS_PREFIX);
235 };
236
237
SAccIdxTableCursor(const CVDBTable & table)238 CWGSResolver_VDB::SAccIdxTableCursor::SAccIdxTableCursor(const CVDBTable& table)
239 : m_Table(table),
240 m_Cursor(table),
241 INIT_OPTIONAL_VDB_COLUMN(ACCESSION_RANGE),
242 INIT_VDB_COLUMN(WGS_PREFIX)
243 {
244 }
245
246
GetDefaultWGSIndexPath(EIndexType index_type)247 string CWGSResolver_VDB::GetDefaultWGSIndexPath(EIndexType index_type)
248 {
249 if ( s_UseWGSRangeIndex() ) {
250 if ( index_type == eMainIndex ) {
251 return NCBI_PARAM_TYPE(WGS, WGS_RANGE_INDEX)::GetDefault();
252 }
253 else if ( index_type == eSecondIndex ) {
254 return NCBI_PARAM_TYPE(WGS, WGS_RANGE_INDEX2)::GetDefault();
255 }
256 }
257 else {
258 if ( index_type == eMainIndex ) {
259 return NCBI_PARAM_TYPE(WGS, WGS_INDEX)::GetDefault();
260 }
261 else if ( index_type == eSecondIndex ) {
262 return NCBI_PARAM_TYPE(WGS, WGS_INDEX2)::GetDefault();
263 }
264 else if ( index_type == eThirdIndex ) {
265 return NCBI_PARAM_TYPE(WGS, WGS_INDEX3)::GetDefault();
266 }
267 }
268 return string();
269 }
270
271
GetDefaultWGSIndexAcc(EIndexType index_type)272 string CWGSResolver_VDB::GetDefaultWGSIndexAcc(EIndexType index_type)
273 {
274 if ( s_UseWGSRangeIndex() ) {
275 if ( index_type == eMainIndex ) {
276 return NCBI_PARAM_TYPE(WGS, WGS_RANGE_INDEX_ACC)::GetDefault();
277 }
278 else if ( index_type == eSecondIndex ) {
279 return NCBI_PARAM_TYPE(WGS, WGS_RANGE_INDEX2_ACC)::GetDefault();
280 }
281 }
282 else {
283 if ( index_type == eMainIndex ) {
284 return NCBI_PARAM_TYPE(WGS, WGS_INDEX_ACC)::GetDefault();
285 }
286 else if ( index_type == eSecondIndex ) {
287 return NCBI_PARAM_TYPE(WGS, WGS_INDEX2_ACC)::GetDefault();
288 }
289 else if ( index_type == eThirdIndex ) {
290 return NCBI_PARAM_TYPE(WGS, WGS_INDEX3_ACC)::GetDefault();
291 }
292 }
293 return string();
294 }
295
296
297 static
GetDirectWGSIndexPath(CWGSResolver_VDB::EIndexType index_type)298 string GetDirectWGSIndexPath(CWGSResolver_VDB::EIndexType index_type)
299 {
300 string path;
301 if ( NCBI_PARAM_TYPE(WGS, RESOLVER_DIRECT_WGS_INDEX)::GetDefault() ) {
302 const char* path1 = 0;
303 const char* path2 = 0;
304 if ( s_UseWGSRangeIndex() ) {
305 if ( index_type == CWGSResolver_VDB::eMainIndex ) {
306 path1 = DEFAULT_WGS_RANGE_INDEX_PATH1;
307 path2 = DEFAULT_WGS_RANGE_INDEX_PATH2;
308 }
309 else if ( index_type == CWGSResolver_VDB::eSecondIndex ) {
310 path1 = DEFAULT_WGS_RANGE_INDEX2_PATH1;
311 path2 = DEFAULT_WGS_RANGE_INDEX2_PATH2;
312 }
313 }
314 else {
315 if ( index_type == CWGSResolver_VDB::eMainIndex ) {
316 path1 = DEFAULT_WGS_INDEX_PATH1;
317 path2 = DEFAULT_WGS_INDEX_PATH2;
318 }
319 else if ( index_type == CWGSResolver_VDB::eSecondIndex ) {
320 path1 = DEFAULT_WGS_INDEX2_PATH1;
321 path2 = DEFAULT_WGS_INDEX2_PATH2;
322 }
323 else if ( index_type == CWGSResolver_VDB::eThirdIndex ) {
324 path1 = DEFAULT_WGS_INDEX3_PATH1;
325 path2 = DEFAULT_WGS_INDEX3_PATH2;
326 }
327 }
328 if ( path1 && CDirEntry(path1).Exists() ) {
329 path = path1;
330 }
331 else if ( path2 && CDirEntry(path2).Exists() ) {
332 path = path2;
333 }
334 }
335 return path;
336 }
337
338
CWGSResolver_VDB(const CVDBMgr & mgr,EIndexType index_type,CWGSResolver_VDB * next_resolver)339 CWGSResolver_VDB::CWGSResolver_VDB(const CVDBMgr& mgr,
340 EIndexType index_type,
341 CWGSResolver_VDB* next_resolver)
342 : m_NextResolver(next_resolver)
343 {
344 string path = GetDefaultWGSIndexPath(index_type);
345 if ( path.empty() ) {
346 string acc = GetDefaultWGSIndexAcc(index_type);
347 // no user-defined index path, try default locations
348 // first try to open index by predefined accession, maybe remotely
349 Open(mgr, acc);
350 if ( IsValid() ) {
351 // opened
352 return;
353 }
354 // then try to open index by direct file acces, only locally
355 path = GetDirectWGSIndexPath(index_type);
356 if ( path.empty() ) {
357 // VDB index is not available
358 return;
359 }
360 }
361 if ( path.find_first_of("\\/") != NPOS && !CDirEntry(path).Exists() ) {
362 // not an accession (has directory separators) and not a file
363 if ( s_DebugEnabled(eDebug_error) ) {
364 ERR_POST_X(9, "CWGSResolver_VDB: cannot find index file: "<<path);
365 }
366 return;
367 }
368 Open(mgr, path);
369 }
370
371
CWGSResolver_VDB(const CVDBMgr & mgr,const string & path,CWGSResolver_VDB * next_resolver)372 CWGSResolver_VDB::CWGSResolver_VDB(const CVDBMgr& mgr,
373 const string& path,
374 CWGSResolver_VDB* next_resolver)
375 : m_NextResolver(next_resolver)
376 {
377 Open(mgr, path);
378 }
379
380
~CWGSResolver_VDB(void)381 CWGSResolver_VDB::~CWGSResolver_VDB(void)
382 {
383 Close();
384 }
385
386
CreateResolver(const CVDBMgr & mgr)387 CRef<CWGSResolver> CWGSResolver_VDB::CreateResolver(const CVDBMgr& mgr)
388 {
389 CRef<CWGSResolver_VDB> ret(new CWGSResolver_VDB(mgr, eMainIndex));
390 if ( !ret->IsValid() ) {
391 return null;
392 }
393 CRef<CWGSResolver_VDB> ret2(new CWGSResolver_VDB(mgr, eSecondIndex, ret));
394 if ( ret2->IsValid() ) {
395 ret = ret2;
396 }
397 if ( !ret->m_AccIndexIsPrefix ) {
398 CRef<CWGSResolver_VDB> ret3(new CWGSResolver_VDB(mgr, eThirdIndex, ret));
399 if ( ret3->IsValid() ) {
400 ret = ret3;
401 }
402 }
403 return CRef<CWGSResolver>(ret);
404 }
405
406
Close(void)407 void CWGSResolver_VDB::Close(void)
408 {
409 TDBMutex::TWriteLockGuard guard(m_DBMutex);
410 x_Close();
411 }
412
413
x_Close()414 void CWGSResolver_VDB::x_Close()
415 {
416 m_Mgr.Close();
417 m_Db.Close();
418 m_GiIdxTable.Close();
419 m_AccIdxTable.Close();
420 m_AccIndex.Close();
421 m_GiIdxCursorCache.Clear();
422 m_AccIdxCursorCache.Clear();
423 }
424
425
// Turn an accession or a path into the path that is actually opened.
// A plain VDB accession is resolved through the VDB manager; if that fails
// with CSraException the argument is used unchanged. Anything else (a real
// path, http:, etc.) is taken as is. If the result exists in the file
// system, symbolic links are dereferenced so that the returned path gives
// the correct timestamp and a longer-living reference.
static string s_ResolveAccOrPath(const CVDBMgr& mgr, const string& acc_or_path)
{
    string path = acc_or_path;
    if ( CVPath::IsPlainAccession(acc_or_path) ) {
        try {
            path = mgr.FindAccPath(acc_or_path);
            if ( s_DebugEnabled(eDebug_open) ) {
                LOG_POST_X(28, "CWGSResolver_VDB("<<acc_or_path<<"): -> "<<path);
            }
        }
        catch ( CSraException& /*ignored*/ ) {
            // could not resolve, keep the argument as it was given
            path = acc_or_path;
        }
    }

    CDirEntry entry(path);
    if ( !entry.Exists() ) {
        return path;
    }
    entry.DereferenceLink();
    const string real_path = entry.GetPath();
    if ( real_path != path ) {
        path = real_path;
        if ( s_DebugEnabled(eDebug_open) ) {
            LOG_POST_X(29, "CWGSResolver_VDB("<<acc_or_path<<"): "
                       "resolved index link to "<<path);
        }
    }
    return path;
}
460
461
Open(const CVDBMgr & mgr,const string & acc_or_path)462 void CWGSResolver_VDB::Open(const CVDBMgr& mgr, const string& acc_or_path)
463 {
464 string path = s_ResolveAccOrPath(mgr, acc_or_path);
465
466 // open VDB file
467 TDBMutex::TWriteLockGuard guard(m_DBMutex);
468 x_Close();
469 m_Mgr = mgr;
470 try {
471 m_Db = CVDB(m_Mgr, path);
472 }
473 catch ( CSraException& exc ) {
474 if ( exc.GetErrCode() == exc.eNotFoundDb ) {
475 return;
476 }
477 throw;
478 }
479
480 // save original argument for possible changes in symbolic links
481 m_WGSIndexPath = acc_or_path;
482 m_WGSIndexResolvedPath = path;
483 if ( !CDirEntry(path).GetTime(&m_Timestamp) ) {
484 m_Timestamp = CTime();
485 }
486 else {
487 if ( s_DebugEnabled(eDebug_open) ) {
488 LOG_POST_X(30, "CWGSResolver_VDB("<<acc_or_path<<"): index timestamp: "<<m_Timestamp);
489 }
490 }
491 m_GiIdxTable = CVDBTable(m_Db, "GI_IDX");
492 m_AccIdxTable = CVDBTable(m_Db, "ACC_IDX");
493 m_AccIndexIsPrefix = true;
494 m_AccIndex = CVDBTableIndex(m_AccIdxTable, "accession_prefix", CVDBTableIndex::eMissing_Allow);
495 if ( !m_AccIndex ) {
496 m_AccIndexIsPrefix = false;
497 m_AccIndex = CVDBTableIndex(m_AccIdxTable, "accession");
498 }
499 }
500
501
Reopen(void)502 void CWGSResolver_VDB::Reopen(void)
503 {
504 if ( CVDBMgr mgr = m_Mgr ) {
505 string path = GetWGSIndexPath();
506 Open(mgr, path);
507 }
508 }
509
510
Update(void)511 bool CWGSResolver_VDB::Update(void)
512 {
513 bool ret = x_Update();
514 if ( m_NextResolver && m_NextResolver->Update() ) {
515 ret = true;
516 }
517 return ret;
518 }
519
520
x_Update(void)521 bool CWGSResolver_VDB::x_Update(void)
522 {
523 string path = s_ResolveAccOrPath(m_Mgr, GetWGSIndexPath());
524 if ( path != GetWGSIndexResolvedPath() ) {
525 // resolved to a different path -> new index by symbolic link
526 LOG_POST_X(32, "CWGSResolver_VDB: new index path: "<<path);
527 Reopen();
528 return true;
529 }
530
531 CTime timestamp;
532 if ( !CDirEntry(path).GetTime(×tamp) ) {
533 // cannot get timestamp -> remote reference
534 return false;
535 }
536 if ( timestamp == m_Timestamp ) {
537 // same timestamp
538 return false;
539 }
540 if ( s_DebugEnabled(eDebug_open) ) {
541 LOG_POST_X(31, "CWGSResolver_VDB: new index timestamp: "<<timestamp);
542 }
543 Reopen();
544 return true;
545 }
546
547
548 inline
GiIdx(TIntId row)549 CRef<CWGSResolver_VDB::SGiIdxTableCursor> CWGSResolver_VDB::GiIdx(TIntId row)
550 {
551 CRef<SGiIdxTableCursor> curs = m_GiIdxCursorCache.Get(row);
552 if ( !curs ) {
553 curs = new SGiIdxTableCursor(GiIdxTable());
554 }
555 return curs;
556 }
557
558
559 inline
AccIdx(void)560 CRef<CWGSResolver_VDB::SAccIdxTableCursor> CWGSResolver_VDB::AccIdx(void)
561 {
562 CRef<SAccIdxTableCursor> curs = m_AccIdxCursorCache.Get();
563 if ( !curs ) {
564 curs = new SAccIdxTableCursor(AccIdxTable());
565 }
566 return curs;
567 }
568
569
570 inline
Put(CRef<SGiIdxTableCursor> & curs,TIntId row)571 void CWGSResolver_VDB::Put(CRef<SGiIdxTableCursor>& curs, TIntId row)
572 {
573 if ( curs->m_Table == GiIdxTable() ) {
574 m_GiIdxCursorCache.Put(curs, row);
575 }
576 }
577
578
579 inline
Put(CRef<SAccIdxTableCursor> & curs)580 void CWGSResolver_VDB::Put(CRef<SAccIdxTableCursor>& curs)
581 {
582 if ( curs->m_Table == AccIdxTable() ) {
583 m_AccIdxCursorCache.Put(curs);
584 }
585 }
586
587
GetPrefixes(TGi gi)588 CWGSResolver::TWGSPrefixes CWGSResolver_VDB::GetPrefixes(TGi gi)
589 {
590 TDBMutex::TReadLockGuard guard(m_DBMutex);
591 TWGSPrefixes ret;
592 if ( s_DebugEnabled(eDebug_resolve) ) {
593 LOG_POST_X(24, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): Resolving "<<gi);
594 }
595 CRef<SGiIdxTableCursor> cur = GiIdx();
596 CVDBStringValue value = cur->WGS_PREFIX(GI_TO(TVDBRowId, gi), CVDBValue::eMissing_Allow);
597 if ( !value.empty() ) {
598 if ( s_DebugEnabled(eDebug_resolve) ) {
599 LOG_POST_X(25, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): WGS prefix "<<*value);
600 }
601 ret.push_back(*value);
602 }
603 Put(cur);
604 if ( ret.empty() && m_NextResolver ) {
605 ret = m_NextResolver->GetPrefixes(gi);
606 }
607 return ret;
608 }
609
610
s_SplitAccIndex(string & uacc,Uint2 & key_num)611 static inline bool s_SplitAccIndex(string& uacc, Uint2& key_num)
612 {
613 size_t acc_len = uacc.size();
614 if ( acc_len <= 4 ) {
615 return false;
616 }
617 size_t prefix_len = acc_len-4;
618 unsigned v = 0;
619 for ( int i = 0; i < 4; ++i ) {
620 char c = uacc[prefix_len+i];
621 if ( c < '0' || c > '9' ) {
622 return false;
623 }
624 v = v*10 + (c-'0');
625 }
626 key_num = v;
627 uacc.erase(prefix_len);
628 return true;
629 }
630
631
GetPrefixes(const string & acc)632 CWGSResolver::TWGSPrefixes CWGSResolver_VDB::GetPrefixes(const string& acc)
633 {
634 TDBMutex::TReadLockGuard guard(m_DBMutex);
635 TWGSPrefixes ret;
636 if ( s_DebugEnabled(eDebug_resolve) ) {
637 LOG_POST_X(26, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): Resolving "<<acc);
638 }
639 string uacc = acc;
640 SAccIdxTableCursor::acc_range_number_t key_num = 0;
641 if ( m_AccIndexIsPrefix ) {
642 if ( !s_SplitAccIndex(uacc, key_num) ) {
643 if ( s_DebugEnabled(eDebug_resolve) ) {
644 LOG_POST_X(27, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): invalid accession");
645 }
646 return ret;
647 }
648 }
649 NStr::ToUpper(uacc);
650 TVDBRowIdRange range;
651 {{
652 PROFILE(sw_AccFind);
653 range = m_AccIndex.Find(uacc);
654 }}
655 if ( s_DebugEnabled(eDebug_resolve) ) {
656 LOG_POST_X(27, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): "
657 "range "<<range.first<<"-"<<range.second);
658 }
659 if ( range.second ) {
660 CRef<SAccIdxTableCursor> cur = AccIdx();
661 for ( TVDBRowCount i = 0; i < range.second; ++i ) {
662 TVDBRowId row_id = range.first+i;
663 if ( m_AccIndexIsPrefix ) {
664 PROFILE(sw_AccRange);
665 CVDBValueFor<SAccIdxTableCursor::acc_range_number_t> v =
666 cur->ACCESSION_RANGE(row_id);
667 if ( v[0] > key_num ) {
668 // current range is past the requested id, end of scan
669 break;
670 }
671 if ( v[1] < key_num ) {
672 // current range is before the requested id, check next range
673 continue;
674 }
675 }
676 PROFILE(sw_WGSPrefix);
677 CTempString prefix = *cur->WGS_PREFIX(row_id);
678 if ( s_DebugEnabled(eDebug_resolve) ) {
679 LOG_POST_X(27, "CWGSResolver_VDB("<<GetWGSIndexPath()<<"): WGS prefix "<<prefix);
680 }
681 ret.push_back(prefix);
682 }
683 Put(cur);
684 }
685 if ( ret.empty() && m_NextResolver ) {
686 ret = m_NextResolver->GetPrefixes(acc);
687 }
688 return ret;
689 }
690
691
692 /////////////////////////////////////////////////////////////////////////////
693 // CWGSResolver_Ids
694 /////////////////////////////////////////////////////////////////////////////
695
696
CWGSResolver_Ids(void)697 CWGSResolver_Ids::CWGSResolver_Ids(void)
698 {
699 }
700
701
~CWGSResolver_Ids(void)702 CWGSResolver_Ids::~CWGSResolver_Ids(void)
703 {
704 }
705
706
ParseWGSPrefix(const CDbtag & dbtag) const707 string CWGSResolver_Ids::ParseWGSPrefix(const CDbtag& dbtag) const
708 {
709 const string& db = dbtag.GetDb();
710 if ( (db.size() != 8 && db.size() != 10) ||
711 !NStr::StartsWith(db, "WGS:") ) {
712 return string();
713 }
714 string prefix = db.substr(4);
715 if ( prefix.size() == 4 ) {
716 prefix += "01";
717 }
718 _ASSERT(prefix.size() == 6);
719 for ( size_t i = 0; i < 4; ++i ) {
720 if ( !isupper(Uint1(prefix[i])) ) {
721 return string();
722 }
723 }
724 for ( size_t i = 4; i < 6; ++i ) {
725 if ( !isdigit(Uint1(prefix[i])) ) {
726 return string();
727 }
728 }
729 return prefix;
730 }
731
732
// Layout of a WGS accession as checked by CWGSResolver_Ids::ParseWGSAcc():
// kNumLetters letters and kVersionDigits digits form the prefix of
// kPrefixLen characters; the overall length limits are derived from
// kMinRowDigits and kMaxRowDigits, plus one character for an optional
// type letter.
733 static const size_t kNumLetters = 4;
734 static const size_t kVersionDigits = 2;
735 static const size_t kPrefixLen = kNumLetters + kVersionDigits;
736 static const size_t kMinRowDigits = 6;
737 static const size_t kMaxRowDigits = 8;
738
739
// Extract the WGS prefix from an accession of the form
//   <kNumLetters letters><kVersionDigits digits>[S|P]<row digits>
// where the number of row digits is within kMinRowDigits..kMaxRowDigits.
// The type letter 'S' is accepted only for nucleotide requests and 'P' only
// for protein requests; an accession without a type letter (contig or
// master sequence) is accepted only for nucleotide requests.
//
// @param acc      accession to parse
// @param protein  true if a protein accession is expected
// @return the first kPrefixLen characters of 'acc', or an empty string if
//         'acc' does not have the expected form
string CWGSResolver_Ids::ParseWGSAcc(const string& acc, bool protein) const
{
    // quick rejection by total length; this also guarantees that
    // acc[kPrefixLen] exists. The exact row digit count is checked below,
    // once it is known whether a type letter is present.
    if ( acc.size() < kPrefixLen + kMinRowDigits ||
         acc.size() > kPrefixLen + kMaxRowDigits + 1 ) { // one for type letter
        return string();
    }
    for ( size_t i = 0; i < kNumLetters; ++i ) {
        if ( !isalpha(acc[i]&0xff) ) {
            return string();
        }
    }
    for ( size_t i = kNumLetters; i < kPrefixLen; ++i ) {
        if ( !isdigit(acc[i]&0xff) ) {
            return string();
        }
    }
    SIZE_TYPE row_pos = kPrefixLen;
    switch ( acc[row_pos] ) { // optional type letter
    case 'S':
        if ( protein ) {
            return string();
        }
        ++row_pos;
        break;
    case 'P':
        if ( !protein ) {
            return string();
        }
        ++row_pos;
        break;
    default:
        // it can be either contig or master sequence
        if ( protein ) {
            return string();
        }
        break;
    }
    // The total length check alone would let through too few digits after
    // a type letter and too many digits without one.
    const size_t row_digits = acc.size() - row_pos;
    if ( row_digits < kMinRowDigits || row_digits > kMaxRowDigits ) {
        return string();
    }
    for ( size_t i = row_pos; i < acc.size(); ++i ) {
        char c = acc[i];
        if ( c < '0' || c > '9' ) {
            return string();
        }
    }
    return acc.substr(0, kPrefixLen);
}
785
786
ParseWGSPrefix(const CTextseq_id & text_id) const787 string CWGSResolver_Ids::ParseWGSPrefix(const CTextseq_id& text_id) const
788 {
789 if ( text_id.IsSetName() ) {
790 // first try name reference if it has WGS format like AAAA01P000001
791 // as it directly contains WGS accession
792 string wgs_acc = ParseWGSAcc(text_id.GetName(), true);
793 if ( !wgs_acc.empty() ) {
794 return wgs_acc;
795 }
796 }
797 if ( text_id.IsSetAccession() ) {
798 const string& acc = text_id.GetAccession();
799 CSeq_id::EAccessionInfo type = CSeq_id::IdentifyAccession(acc);
800 if ( !(type & CSeq_id::fAcc_prot) ) {
801 switch ( type & CSeq_id::eAcc_division_mask ) {
802 // accepted accession types
803 case CSeq_id::eAcc_wgs:
804 case CSeq_id::eAcc_wgs_intermed:
805 case CSeq_id::eAcc_tsa:
806 return ParseWGSAcc(acc, false);
807 default:
808 break;
809 }
810 }
811 }
812 return string();
813 }
814
815
ParseWGSPrefix(const CSeq_id & id) const816 string CWGSResolver_Ids::ParseWGSPrefix(const CSeq_id& id) const
817 {
818 if ( id.IsGeneral() ) {
819 return ParseWGSPrefix(id.GetGeneral());
820 }
821 else if ( const CTextseq_id* text_id = id.GetTextseq_Id() ) {
822 return ParseWGSPrefix(*text_id);
823 }
824 return string();
825 }
826
827
GetPrefixes(TGi gi)828 CWGSResolver::TWGSPrefixes CWGSResolver_Ids::GetPrefixes(TGi gi)
829 {
830 CSeq_id seq_id;
831 seq_id.SetGi(gi);
832 return GetPrefixes(seq_id);
833 }
834
835
GetPrefixes(const string & acc)836 CWGSResolver::TWGSPrefixes CWGSResolver_Ids::GetPrefixes(const string& acc)
837 {
838 CSeq_id seq_id(acc);
839 return GetPrefixes(seq_id);
840 }
841
842
843 /////////////////////////////////////////////////////////////////////////////
844 // CWGSResolver_DL
845 /////////////////////////////////////////////////////////////////////////////
846
847
CWGSResolver_DL(void)848 CWGSResolver_DL::CWGSResolver_DL(void)
849 : m_Loader(CObjectManager::GetInstance()->FindDataLoader("GBLOADER"))
850 {
851
852 }
853
854
CWGSResolver_DL(CDataLoader * loader)855 CWGSResolver_DL::CWGSResolver_DL(CDataLoader* loader)
856 : m_Loader(loader)
857 {
858 }
859
860
~CWGSResolver_DL(void)861 CWGSResolver_DL::~CWGSResolver_DL(void)
862 {
863 }
864
865
866 CRef<CWGSResolver>
CreateResolver(CDataLoader * loader)867 CWGSResolver_DL::CreateResolver(CDataLoader* loader)
868 {
869 if ( !loader ) {
870 return null;
871 }
872 return CRef<CWGSResolver>(new CWGSResolver_DL(loader));
873 }
874
875
876 CRef<CWGSResolver>
CreateResolver(void)877 CWGSResolver_DL::CreateResolver(void)
878 {
879 if ( !NCBI_PARAM_TYPE(WGS, RESOLVER_GENBANK)::GetDefault() ) {
880 return null;
881 }
882 CRef<CWGSResolver_DL> resolver(new CWGSResolver_DL());
883 if ( !resolver->IsValid() ) {
884 return null;
885 }
886 return CRef<CWGSResolver>(resolver);
887 }
888
889
GetPrefixes(const CSeq_id & id)890 CWGSResolver::TWGSPrefixes CWGSResolver_DL::GetPrefixes(const CSeq_id& id)
891 {
892 TWGSPrefixes prefixes;
893 if ( s_DebugEnabled(eDebug_resolve) ) {
894 LOG_POST_X(10, "CWGSResolver_DL: "
895 "Asking DataLoader for ids of "<<id.AsFastaString());
896 }
897 CDataLoader::TIds ids;
898 m_Loader->GetIds(CSeq_id_Handle::GetHandle(id), ids);
899 ITERATE ( CDataLoader::TIds, rit, ids ) {
900 if ( s_DebugEnabled(eDebug_resolve) ) {
901 LOG_POST_X(11, "CWGSResolver_DL: Parsing Seq-id "<<*rit);
902 }
903 string prefix = ParseWGSPrefix(*rit->GetSeqId());
904 if ( !prefix.empty() ) {
905 if ( s_DebugEnabled(eDebug_resolve) ) {
906 LOG_POST_X(12, "CWGSResolver_DL: WGS prefix: "<<prefix);
907 }
908 prefixes.push_back(prefix);
909 break;
910 }
911 }
912 return prefixes;
913 }
914
915
916 /////////////////////////////////////////////////////////////////////////////
917 // CWGSResolver_Proc
918 /////////////////////////////////////////////////////////////////////////////
919
920
CWGSResolver_Proc(CID2ProcessorResolver * resolver)921 CWGSResolver_Proc::CWGSResolver_Proc(CID2ProcessorResolver* resolver)
922 : m_Resolver(resolver)
923 {
924 }
925
926
~CWGSResolver_Proc(void)927 CWGSResolver_Proc::~CWGSResolver_Proc(void)
928 {
929 }
930
931
932 CRef<CWGSResolver>
CreateResolver(CID2ProcessorResolver * resolver)933 CWGSResolver_Proc::CreateResolver(CID2ProcessorResolver* resolver)
934 {
935 if ( !resolver ) {
936 return null;
937 }
938 return CRef<CWGSResolver>(new CWGSResolver_Proc(resolver));
939 }
940
941
GetPrefixes(const CSeq_id & id)942 CWGSResolver::TWGSPrefixes CWGSResolver_Proc::GetPrefixes(const CSeq_id& id)
943 {
944 TWGSPrefixes prefixes;
945 if ( s_DebugEnabled(eDebug_resolve) ) {
946 LOG_POST_X(13, "CWGSResolver_Proc: "
947 "Asking GB for ids of "<<id.AsFastaString());
948 }
949 CID2ProcessorResolver::TIds ids = m_Resolver->GetIds(id);
950 ITERATE ( CID2ProcessorResolver::TIds, rit, ids ) {
951 if ( s_DebugEnabled(eDebug_resolve) ) {
952 LOG_POST_X(14, "CWGSResolver_Proc: "
953 "Parsing Seq-id "<<(*rit)->AsFastaString());
954 }
955 string prefix = ParseWGSPrefix(**rit);
956 if ( !prefix.empty() ) {
957 if ( s_DebugEnabled(eDebug_resolve) ) {
958 LOG_POST_X(15, "CWGSResolver_Proc: WGS prefix: "<<prefix);
959 }
960 prefixes.push_back(prefix);
961 break;
962 }
963 }
964 return prefixes;
965 }
966
967
968 #ifdef WGS_RESOLVER_USE_ID2_CLIENT
969
970 /////////////////////////////////////////////////////////////////////////////
971 // CWGSResolver_ID2
972 /////////////////////////////////////////////////////////////////////////////
973
974
CWGSResolver_ID2(void)975 CWGSResolver_ID2::CWGSResolver_ID2(void)
976 : m_ID2Client(new CID2Client())
977 {
978 }
979
980
~CWGSResolver_ID2(void)981 CWGSResolver_ID2::~CWGSResolver_ID2(void)
982 {
983 }
984
985
986 CRef<CWGSResolver>
CreateResolver(void)987 CWGSResolver_ID2::CreateResolver(void)
988 {
989 CRef<CWGSResolver_ID2> resolver(new CWGSResolver_ID2);
990 if ( !resolver->IsValid() ) {
991 return null;
992 }
993 return CRef<CWGSResolver>(resolver);
994 }
995
996
ParseWGSPrefix(const CID2_Reply & reply) const997 string CWGSResolver_ID2::ParseWGSPrefix(const CID2_Reply& reply) const
998 {
999 if ( !reply.GetReply().IsGet_seq_id() ) {
1000 return string();
1001 }
1002 const CID2_Reply_Get_Seq_id& reply_id = reply.GetReply().GetGet_seq_id();
1003 if ( !reply_id.IsSetSeq_id() ) {
1004 return string();
1005 }
1006 const CID2_Reply_Get_Seq_id::TSeq_id& ids = reply_id.GetSeq_id();
1007 ITERATE ( CID2_Reply_Get_Seq_id::TSeq_id, it, ids ) {
1008 string prefix = CWGSResolver_Ids::ParseWGSPrefix(**it);
1009 if ( !prefix.empty() ) {
1010 return prefix;
1011 }
1012 }
1013 return string();
1014 }
1015
1016
Update(void)1017 bool CWGSResolver_ID2::Update(void)
1018 {
1019 CMutexGuard guard(m_Mutex);
1020 bool ret = !m_Cache.empty();
1021 m_Cache.clear();
1022 return ret;
1023 }
1024
1025
GetPrefixes(const CSeq_id & id)1026 CWGSResolver::TWGSPrefixes CWGSResolver_ID2::GetPrefixes(const CSeq_id& id)
1027 {
1028 TWGSPrefixes prefixes;
1029 CMutexGuard guard(m_Mutex);
1030 string id_str = id.AsFastaString();
1031 TCache::const_iterator iter = m_Cache.find(id_str);
1032 if ( iter != m_Cache.end() ) {
1033 if ( !iter->second.empty() ) {
1034 prefixes.push_back(iter->second);
1035 }
1036 return prefixes;
1037 }
1038 CID2_Request_Get_Seq_id req;
1039 req.SetSeq_id().SetSeq_id(const_cast<CSeq_id&>(id));
1040 req.SetSeq_id_type(req.eSeq_id_type_general);
1041 if ( s_DebugEnabled(eDebug_resolve) ) {
1042 LOG_POST_X(16, "CWGSResolver_ID2: "
1043 "Asking ID2 for ids of "<<id.AsFastaString());
1044 }
1045 m_ID2Client->AskGet_seq_id(req);
1046 const CID2Client::TReplies& replies = m_ID2Client->GetAllReplies();
1047 ITERATE ( CID2Client::TReplies, rit, replies ) {
1048 if ( s_DebugEnabled(eDebug_resolve) ) {
1049 LOG_POST_X(17, "CWGSResolver_ID2: "
1050 "Parsing ID2 reply "<<MSerial_AsnText<<**rit);
1051 }
1052 string prefix = ParseWGSPrefix(**rit);
1053 if ( !prefix.empty() ) {
1054 if ( s_DebugEnabled(eDebug_resolve) ) {
1055 LOG_POST_X(18, "CWGSResolver_ID2: WGS prefix: "<<prefix);
1056 }
1057 prefixes.push_back(prefix);
1058 break;
1059 }
1060 }
1061 string& save = m_Cache[id_str];
1062 if ( !prefixes.empty() ) {
1063 save = prefixes[0];
1064 }
1065 return prefixes;
1066 }
1067
1068 #endif //WGS_RESOLVER_USE_ID2_CLIENT
1069
1070
1071 END_NAMESPACE(objects);
1072 END_NCBI_NAMESPACE;
1073