1 /* $Id: wgsmaster.cpp 628884 2021-04-07 12:54:35Z ivanov $
2 * ===========================================================================
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 * ===========================================================================
23 *
24 * Author: Eugene Vasilchenko
25 *
26 * File Description: WGS master sequence support (master Seq-id lookup and master descriptors)
27 *
28 */
29
30 #include <ncbi_pch.hpp>
31
32 #include <objtools/data_loaders/genbank/impl/wgsmaster.hpp>
33 #include <objtools/data_loaders/genbank/blob_id.hpp>
34 #include <objtools/error_codes.hpp>
35 #include <objmgr/data_loader.hpp>
36 #include <objmgr/impl/tse_split_info.hpp>
37 #include <objmgr/impl/tse_chunk_info.hpp>
38 #include <objmgr/impl/tse_loadlock.hpp>
39 #include <objmgr/impl/bioseq_set_info.hpp>
40 #include <objmgr/objmgr_exception.hpp>
41 #include <objects/general/general__.hpp>
42
43
44 #define NCBI_USE_ERRCODE_X Objtools_Rd_Process
45
46 BEGIN_NCBI_SCOPE
47 BEGIN_SCOPE(objects)
48
49
// Compile-time switch: when true, master descriptors are loaded into the TSE
// through the master chunk; when false, they are attached to individual
// bioseqs by means of a bioseq updater.
constexpr bool kAddMasterDescrToTSE = true;
51
52
53 BEGIN_LOCAL_NAMESPACE;
54
55
56 static
s_GoodLetters(CTempString s)57 bool s_GoodLetters(CTempString s) {
58 ITERATE ( CTempString, it, s ) {
59 if ( !isalpha(*it & 0xff) ) {
60 return false;
61 }
62 }
63 return true;
64 }
65
66
67 static
s_GoodDigits(CTempString s)68 bool s_GoodDigits(CTempString s) {
69 bool have_non_zero = false;
70 ITERATE ( CTempString, it, s ) {
71 if ( *it != '0' ) {
72 have_non_zero = true;
73 if ( !isdigit(*it & 0xff) ) {
74 return false;
75 }
76 }
77 }
78 return have_non_zero;
79 }
80
81
82
83
84 static const int kForceDescrMask = ((1<<CSeqdesc::e_User));
85
86 static const int kRefSeqOptionalDescrMask = ((1<<CSeqdesc::e_Pub) |
87 (1<<CSeqdesc::e_Comment));
88
89 static const int kOptionalDescrMask = ((1<<CSeqdesc::e_Source) |
90 (1<<CSeqdesc::e_Molinfo) |
91 (1<<CSeqdesc::e_Create_date) |
92 (1<<CSeqdesc::e_Update_date) |
93 (1<<CSeqdesc::e_Genbank) |
94 (1<<CSeqdesc::e_Embl));
95
96 static const int kGoodDescrMask = kForceDescrMask | kRefSeqOptionalDescrMask | kOptionalDescrMask;
97
98
99 static
s_IsGoodDescr(const CSeqdesc & desc,int mask,const TUserObjectTypesSet & uo_types)100 bool s_IsGoodDescr(const CSeqdesc& desc, int mask, const TUserObjectTypesSet& uo_types)
101 {
102 if ( desc.Which() == CSeqdesc::e_User ) {
103 const CObject_id& type = desc.GetUser().GetType();
104 if ( type.Which() == CObject_id::e_Str ) {
105 string name = type.GetStr();
106 // Only a few user object types are eligible to be taken from master
107 if ( name == "DBLink" ||
108 name == "GenomeProjectsDB" ||
109 name == "StructuredComment" ||
110 name == "FeatureFetchPolicy" ||
111 name == "Unverified" ) {
112 // For StructuredComment, extract the comment prefix and add to the name
113 if (name == "StructuredComment") {
114 // This loop should normally stop on the first iteration...
115 ITERATE (CUser_object::TData, it, desc.GetUser().GetData()) {
116 if ((*it)->GetLabel().IsStr() &&
117 (*it)->GetLabel().GetStr() == "StructuredCommentPrefix") {
118 string data = ((*it)->GetData().IsStr() ?
119 (string) (*it)->GetData().GetStr() :
120 NStr::IntToString((*it)->GetData().GetInt()));
121 name += "|" + data;
122 break;
123 }
124 }
125 }
126 // Check if this user object type should be skipped because it already exists
127 if (uo_types.count(name) == 0)
128 return true;
129 }
130 }
131 }
132 else if ( (1 << desc.Which()) & mask ) {
133 return true;
134 }
135 return false;
136 }
137
138
139
140
141 END_LOCAL_NAMESPACE;
142
143
// Derive the Seq-id of the master record from the Seq-id of a contig.
//
// Returns an empty handle if 'idh' is not a text Seq-id with an accession of
// an accepted division (WGS, WGS intermediate, TSA, or MGA), if the accession
// does not have the expected letters/digits layout, or if it is itself a
// master accession (zero version or zero row number).
// Otherwise the returned id is a copy of the original id type with the
// accession's digits all replaced by '0' and the version set to the 2-digit
// version embedded in the accession (1 for CAGE DDBJ).
CSeq_id_Handle CWGSMasterSupport::GetWGSMasterSeq_id(const CSeq_id_Handle& idh)
{
    switch ( idh.Which() ) { // shortcut to exclude all non Textseq-id types
    case CSeq_id::e_not_set:
    case CSeq_id::e_Local:
    case CSeq_id::e_Gi:
    case CSeq_id::e_Gibbsq:
    case CSeq_id::e_Gibbmt:
    case CSeq_id::e_Giim:
    case CSeq_id::e_Patent:
    case CSeq_id::e_General:
    case CSeq_id::e_Pdb:
        return CSeq_id_Handle();
    default:
        break;
    }

    CConstRef<CSeq_id> id = idh.GetSeqId();
    const CTextseq_id* text_id = id->GetTextseq_Id();
    if ( !text_id || !text_id->IsSetAccession() ) {
        return CSeq_id_Handle();
    }

    CTempString acc = text_id->GetAccession();

    // Accept only a few accession divisions
    CSeq_id::EAccessionInfo acc_info = CSeq_id::IdentifyAccession(acc);
    bool is_cage_ddbj = false;
    switch ( acc_info & CSeq_id::eAcc_division_mask ) {
    case CSeq_id::eAcc_mga: // 2019/02/08 : For now, it's just CAGE DDBJ
        is_cage_ddbj = true;
        break;
    case CSeq_id::eAcc_wgs:
    case CSeq_id::eAcc_wgs_intermed:
    case CSeq_id::eAcc_tsa:
        break;
    default:
        return CSeq_id_Handle();
    }

    const SIZE_TYPE digits_pos = acc.find_first_of("0123456789");
    const SIZE_TYPE letters_pos = NStr::StartsWith(acc, "NZ_") ? 3 : 0;

    // First check the prefix and suffix lengths.
    // WGS/TSA/TLS prefixes have 4 or 6 letters; CAGE DDBJ prefixes have 5 letters
    // WGS/TSA/TLS suffixes have 8-10 or 9-11 digits (including 2-digit version);
    // CAGE DDBJ suffixes have 7 digits
    SIZE_TYPE min_digits = 0;
    SIZE_TYPE max_digits = 0;
    if ( is_cage_ddbj ) {
        if ( digits_pos != 5 ) {
            return CSeq_id_Handle();
        }
        min_digits = 7;
        max_digits = 7;
    }
    else if ( digits_pos == letters_pos + 4 ) {
        min_digits = 8;
        max_digits = 10;
    }
    else if ( digits_pos == letters_pos + 6 ) {
        min_digits = 9;
        max_digits = 11;
    }
    else {
        return CSeq_id_Handle();
    }

    const SIZE_TYPE digits_count = acc.size() - digits_pos;
    if ( digits_count < min_digits || digits_count > max_digits ) {
        return CSeq_id_Handle();
    }

    // Check that prefix and suffix actually consist of letters and digits respectively.
    const CTempString letters = acc.substr(letters_pos, digits_pos - letters_pos);
    const CTempString digits = acc.substr(digits_pos);
    if ( !s_GoodLetters(letters) || !s_GoodDigits(digits) ) {
        return CSeq_id_Handle();
    }

    // Exclude master accessions
    // Non-CAGE-DDBJ master accessions may also contain a 2-digit version
    int version = 0;
    Uint8 row_id = 0;
    if ( is_cage_ddbj ) {
        version = 1;
        row_id = NStr::StringToNumeric<Uint8>(digits);
    }
    else {
        version = NStr::StringToNumeric<int>(digits.substr(0, 2));
        row_id = NStr::StringToNumeric<Uint8>(digits.substr(2));
    }
    if ( !version || !row_id ) {
        return CSeq_id_Handle();
    }

    // Build the master id: same id type, prefix kept, all digits zeroed
    CSeq_id master_id;
    master_id.Assign(*id);
    CTextseq_id* master_text_id =
        const_cast<CTextseq_id*>(master_id.GetTextseq_Id());
    string master_acc = acc.substr(0, digits_pos);
    master_acc.append(acc.size() - digits_pos, '0');
    master_text_id->Reset();
    master_text_id->SetAccession(master_acc);
    master_text_id->SetVersion(version);
    return CSeq_id_Handle::GetHandle(master_id);
}
247
248
249 CWGSMasterSupport::EDescrType
GetDescrType(const CSeq_id_Handle & master_seq_idh)250 CWGSMasterSupport::GetDescrType(const CSeq_id_Handle& master_seq_idh)
251 {
252 return master_seq_idh.Which() == CSeq_id::e_Other? eDescrTypeRefSeq: eDescrTypeDefault;
253 }
254
255
GetForceDescrMask(EDescrType type)256 int CWGSMasterSupport::GetForceDescrMask(EDescrType type)
257 {
258 int force_mask = kForceDescrMask;
259 if ( type != eDescrTypeRefSeq ) {
260 force_mask |= kRefSeqOptionalDescrMask;
261 }
262 return force_mask;
263 }
264
265
GetOptionalDescrMask(EDescrType type)266 int CWGSMasterSupport::GetOptionalDescrMask(EDescrType type)
267 {
268 int optional_mask = kForceDescrMask;
269 if ( type == eDescrTypeRefSeq ) {
270 optional_mask |= kRefSeqOptionalDescrMask;
271 }
272 return optional_mask;
273 }
274
275
AddMasterDescr(CBioseq_Info & seq,const CSeq_descr & src,EDescrType type)276 void CWGSMasterSupport::AddMasterDescr(CBioseq_Info& seq,
277 const CSeq_descr& src,
278 EDescrType type)
279 {
280 int existing_mask = 0;
281 CSeq_descr::Tdata& dst = seq.x_SetDescr().Set();
282 ITERATE ( CSeq_descr::Tdata, it, dst ) {
283 const CSeqdesc& desc = **it;
284 existing_mask |= 1 << desc.Which();
285 }
286 int force_mask = GetForceDescrMask(type);
287 int optional_mask = GetOptionalDescrMask(type);
288 ITERATE ( CSeq_descr::Tdata, it, src.Get() ) {
289 int mask = 1 << (*it)->Which();
290 if ( mask & optional_mask ) {
291 if ( mask & existing_mask ) {
292 continue;
293 }
294 }
295 else if ( !(mask & force_mask) ) {
296 continue;
297 }
298 dst.push_back(*it);
299 }
300 }
301
302
HasMasterId(const CBioseq_Info & seq,const CSeq_id_Handle & master_idh)303 bool CWGSMasterSupport::HasMasterId(const CBioseq_Info& seq, const CSeq_id_Handle& master_idh)
304 {
305 if ( master_idh ) {
306 const CBioseq_Info::TId& ids = seq.GetId();
307 ITERATE ( CBioseq_Info::TId, it, ids ) {
308 if ( GetWGSMasterSeq_id(*it) == master_idh ) {
309 return true;
310 }
311 }
312 }
313 return false;
314 }
315
316
GetWGSMasterDescr(CDataLoader * loader,const CSeq_id_Handle & master_idh,int mask,TUserObjectTypesSet & uo_types)317 CRef<CSeq_descr> CWGSMasterSupport::GetWGSMasterDescr(CDataLoader* loader,
318 const CSeq_id_Handle& master_idh,
319 int mask, TUserObjectTypesSet& uo_types)
320 {
321 CRef<CSeq_descr> ret;
322 CDataLoader::TTSE_LockSet locks =
323 loader->GetRecordsNoBlobState(master_idh, CDataLoader::eBioseqCore);
324 ITERATE ( CDataLoader::TTSE_LockSet, it, locks ) {
325 CConstRef<CBioseq_Info> bs_info =
326 (*it)->FindMatchingBioseq(master_idh);
327 if ( !bs_info ) {
328 continue;
329 }
330 if ( bs_info->IsSetDescr() ) {
331 const CSeq_descr::Tdata& descr = bs_info->GetDescr().Get();
332 ITERATE ( CSeq_descr::Tdata, it, descr ) {
333 if ( s_IsGoodDescr(**it, mask, uo_types) ) {
334 if ( !ret ) {
335 ret = new CSeq_descr;
336 }
337 ret->Set().push_back(*it);
338 }
339 }
340 }
341 break;
342 }
343 return ret;
344 }
345
346
LoadWGSMaster(CDataLoader * loader,CRef<CTSE_Chunk_Info> chunk)347 void CWGSMasterSupport::LoadWGSMaster(CDataLoader* loader,
348 CRef<CTSE_Chunk_Info> chunk)
349 {
350 CWGSMasterChunkInfo& chunk_info =
351 dynamic_cast<CWGSMasterChunkInfo&>(*chunk);
352 CSeq_id_Handle id = chunk_info.m_MasterId;
353 int mask = chunk_info.m_DescrMask;
354 CRef<CSeq_descr> descr =
355 GetWGSMasterDescr(loader, id, mask, chunk_info.m_UserObjectTypes);
356 if ( descr ) {
357 if ( kAddMasterDescrToTSE ) {
358 chunk->x_LoadDescr(CTSE_Chunk_Info::TPlace(), *descr);
359 }
360 else {
361 CRef<CBioseqUpdater> upd(new CWGSBioseqUpdaterDescr(id, descr));
362 const_cast<CTSE_Split_Info&>(chunk->GetSplitInfo()).x_SetBioseqUpdater(upd);
363 }
364 }
365 chunk->SetLoaded();
366 }
367
368
AddWGSMaster(CTSE_LoadLock & lock)369 void CWGSMasterSupport::AddWGSMaster(CTSE_LoadLock& lock)
370 {
371 CTSE_Info::TSeqIds ids;
372 lock->GetBioseqsIds(ids);
373 ITERATE ( CTSE_Info::TSeqIds, it, ids ) {
374 if ( CSeq_id_Handle id = GetWGSMasterSeq_id(*it) ) {
375 int mask = kGoodDescrMask;
376 TUserObjectTypesSet existing_uo_types;
377 if ( kAddMasterDescrToTSE ) {
378 // exclude existing descr types except forced ones (User, Pub, Comment)
379 int force_descr = GetForceDescrMask(GetDescrType(id));
380 mask &= ~lock->x_GetBaseInfo().x_GetExistingDescrMask() | force_descr;
381 lock->x_GetBaseInfo().x_AddExistingUserObjectTypes(existing_uo_types);
382 if ( lock->IsSet() ) {
383 if ( auto first_entry = lock->GetSet().GetFirstEntry() ) {
384 mask &= ~first_entry->x_GetBaseInfo().x_GetExistingDescrMask() | force_descr;
385 first_entry->x_GetBaseInfo().x_AddExistingUserObjectTypes(existing_uo_types);
386 }
387 }
388 }
389 CRef<CTSE_Chunk_Info> chunk(new CWGSMasterChunkInfo(id, mask, existing_uo_types));
390 lock->GetSplitInfo().AddChunk(*chunk);
391 if ( kAddMasterDescrToTSE ) {
392 chunk->x_AddDescInfo(mask, 0);
393 }
394 else {
395 CRef<CBioseqUpdater> upd(new CWGSBioseqUpdaterChunk(id));
396 lock->SetBioseqUpdater(upd);
397 }
398 break;
399 }
400 }
401 }
402
403
CWGSBioseqUpdater_Base(const CSeq_id_Handle & master_idh)404 CWGSBioseqUpdater_Base::CWGSBioseqUpdater_Base(const CSeq_id_Handle& master_idh)
405 : m_MasterId(master_idh)
406 {
407 }
408
409
~CWGSBioseqUpdater_Base()410 CWGSBioseqUpdater_Base::~CWGSBioseqUpdater_Base()
411 {
412 }
413
414
CWGSMasterChunkInfo(const CSeq_id_Handle & master_idh,int mask,TUserObjectTypesSet & uo_types)415 CWGSMasterChunkInfo::CWGSMasterChunkInfo(const CSeq_id_Handle& master_idh,
416 int mask, TUserObjectTypesSet& uo_types)
417 : CTSE_Chunk_Info(kMasterWGS_ChunkId),
418 m_MasterId(master_idh),
419 m_DescrMask(mask),
420 m_UserObjectTypes(move(uo_types))
421 {
422 }
423
424
~CWGSMasterChunkInfo()425 CWGSMasterChunkInfo::~CWGSMasterChunkInfo()
426 {
427 }
428
429
CWGSBioseqUpdaterChunk(const CSeq_id_Handle & master_idh)430 CWGSBioseqUpdaterChunk::CWGSBioseqUpdaterChunk(const CSeq_id_Handle& master_idh)
431 : CWGSBioseqUpdater_Base(master_idh)
432 {
433 }
434
435
~CWGSBioseqUpdaterChunk()436 CWGSBioseqUpdaterChunk::~CWGSBioseqUpdaterChunk()
437 {
438 }
439
440
Update(CBioseq_Info & seq)441 void CWGSBioseqUpdaterChunk::Update(CBioseq_Info& seq)
442 {
443 if ( HasMasterId(seq) ) {
444 // register master descr chunk
445 seq.x_AddDescrChunkId(kGoodDescrMask, kMasterWGS_ChunkId);
446 }
447 }
448
449
CWGSBioseqUpdaterDescr(const CSeq_id_Handle & master_idh,CRef<CSeq_descr> descr)450 CWGSBioseqUpdaterDescr::CWGSBioseqUpdaterDescr(const CSeq_id_Handle& master_idh,
451 CRef<CSeq_descr> descr)
452 : CWGSBioseqUpdater_Base(master_idh),
453 m_Descr(descr)
454 {
455 }
456
457
~CWGSBioseqUpdaterDescr()458 CWGSBioseqUpdaterDescr::~CWGSBioseqUpdaterDescr()
459 {
460 }
461
462
Update(CBioseq_Info & seq)463 void CWGSBioseqUpdaterDescr::Update(CBioseq_Info& seq)
464 {
465 if ( m_Descr &&
466 seq.x_NeedUpdate(seq.fNeedUpdate_descr) &&
467 HasMasterId(seq) ) {
468 AddMasterDescr(seq, *m_Descr, GetDescrType(GetMasterId()));
469 }
470 }
471
472
473 END_SCOPE(objects)
474 END_NCBI_SCOPE
475