1 /*  $Id: wgsmaster.cpp 628884 2021-04-07 12:54:35Z ivanov $
2  * ===========================================================================
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  * ===========================================================================
23  *
24  *  Author:  Eugene Vasilchenko
25  *
 *  File Description: WGS master sequence support (master Seq-id detection
 *                    and loading of master descriptors)
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 
32 #include <objtools/data_loaders/genbank/impl/wgsmaster.hpp>
33 #include <objtools/data_loaders/genbank/blob_id.hpp>
34 #include <objtools/error_codes.hpp>
35 #include <objmgr/data_loader.hpp>
36 #include <objmgr/impl/tse_split_info.hpp>
37 #include <objmgr/impl/tse_chunk_info.hpp>
38 #include <objmgr/impl/tse_loadlock.hpp>
39 #include <objmgr/impl/bioseq_set_info.hpp>
40 #include <objmgr/objmgr_exception.hpp>
41 #include <objects/general/general__.hpp>
42 
43 
44 #define NCBI_USE_ERRCODE_X   Objtools_Rd_Process
45 
46 BEGIN_NCBI_SCOPE
47 BEGIN_SCOPE(objects)
48 
49 
// Selects how master descriptors are attached.
// When true, they are loaded through the master chunk itself
// (x_AddDescInfo() on registration, x_LoadDescr() on load);
// when false, CBioseqUpdater objects are installed instead.
const bool kAddMasterDescrToTSE = true;
51 
52 
53 BEGIN_LOCAL_NAMESPACE;
54 
55 
56 static
s_GoodLetters(CTempString s)57 bool s_GoodLetters(CTempString s) {
58     ITERATE ( CTempString, it, s ) {
59         if ( !isalpha(*it & 0xff) ) {
60             return false;
61         }
62     }
63     return true;
64 }
65 
66 
67 static
s_GoodDigits(CTempString s)68 bool s_GoodDigits(CTempString s) {
69     bool have_non_zero = false;
70     ITERATE ( CTempString, it, s ) {
71         if ( *it != '0' ) {
72             have_non_zero = true;
73             if ( !isdigit(*it & 0xff) ) {
74                 return false;
75             }
76         }
77     }
78     return have_non_zero;
79 }
80 
81 
82 
83 
84 static const int kForceDescrMask = ((1<<CSeqdesc::e_User));
85 
86 static const int kRefSeqOptionalDescrMask = ((1<<CSeqdesc::e_Pub) |
87                                              (1<<CSeqdesc::e_Comment));
88 
89 static const int kOptionalDescrMask = ((1<<CSeqdesc::e_Source) |
90                                        (1<<CSeqdesc::e_Molinfo) |
91                                        (1<<CSeqdesc::e_Create_date) |
92                                        (1<<CSeqdesc::e_Update_date) |
93                                        (1<<CSeqdesc::e_Genbank) |
94                                        (1<<CSeqdesc::e_Embl));
95 
96 static const int kGoodDescrMask = kForceDescrMask | kRefSeqOptionalDescrMask | kOptionalDescrMask;
97 
98 
99 static
s_IsGoodDescr(const CSeqdesc & desc,int mask,const TUserObjectTypesSet & uo_types)100 bool s_IsGoodDescr(const CSeqdesc& desc, int mask, const TUserObjectTypesSet& uo_types)
101 {
102     if ( desc.Which() == CSeqdesc::e_User ) {
103         const CObject_id& type = desc.GetUser().GetType();
104         if ( type.Which() == CObject_id::e_Str ) {
105             string name = type.GetStr();
106             // Only a few user object types are eligible to be taken from master
107             if ( name == "DBLink" ||
108                  name == "GenomeProjectsDB" ||
109                  name == "StructuredComment" ||
110                  name == "FeatureFetchPolicy" ||
111                  name == "Unverified" ) {
112                 // For StructuredComment, extract the comment prefix and add to the name
113                 if (name == "StructuredComment") {
114                     // This loop should normally stop on the first iteration...
115                     ITERATE (CUser_object::TData, it, desc.GetUser().GetData()) {
116                         if ((*it)->GetLabel().IsStr() &&
117                             (*it)->GetLabel().GetStr() == "StructuredCommentPrefix") {
118                             string data = ((*it)->GetData().IsStr() ?
119                                            (string) (*it)->GetData().GetStr() :
120                                            NStr::IntToString((*it)->GetData().GetInt()));
121                             name += "|" + data;
122                             break;
123                         }
124                     }
125                 }
126                 // Check if this user object type should be skipped because it already exists
127                 if (uo_types.count(name) == 0)
128                     return true;
129             }
130         }
131     }
132     else if ( (1 << desc.Which()) & mask ) {
133         return true;
134     }
135     return false;
136 }
137 
138 
139 
140 
141 END_LOCAL_NAMESPACE;
142 
143 
GetWGSMasterSeq_id(const CSeq_id_Handle & idh)144 CSeq_id_Handle CWGSMasterSupport::GetWGSMasterSeq_id(const CSeq_id_Handle& idh)
145 {
146     CSeq_id_Handle master_idh;
147 
148     switch ( idh.Which() ) { // shortcut to exclude all non Textseq-id types
149     case CSeq_id::e_not_set:
150     case CSeq_id::e_Local:
151     case CSeq_id::e_Gi:
152     case CSeq_id::e_Gibbsq:
153     case CSeq_id::e_Gibbmt:
154     case CSeq_id::e_Giim:
155     case CSeq_id::e_Patent:
156     case CSeq_id::e_General:
157     case CSeq_id::e_Pdb:
158         return master_idh;
159     default:
160         break;
161     }
162 
163     CConstRef<CSeq_id> id = idh.GetSeqId();
164     const CTextseq_id* text_id = id->GetTextseq_Id();
165     if ( !text_id || !text_id->IsSetAccession() ) {
166         return master_idh;
167     }
168 
169     CTempString acc = text_id->GetAccession();
170 
171     CSeq_id::EAccessionInfo type = CSeq_id::IdentifyAccession(acc);
172     bool is_cage_ddbj = false;
173     switch ( type & CSeq_id::eAcc_division_mask ) {
174         // accepted accession types
175     case CSeq_id::eAcc_mga: // 2019/02/08 : For now, it's just CAGE DDBJ
176         is_cage_ddbj = true;
177     case CSeq_id::eAcc_wgs:
178     case CSeq_id::eAcc_wgs_intermed:
179     case CSeq_id::eAcc_tsa:
180         break;
181     default:
182         return master_idh;
183     }
184 
185     SIZE_TYPE digits_pos = acc.find_first_of("0123456789");
186     bool have_nz = NStr::StartsWith(acc, "NZ_");
187     SIZE_TYPE letters_pos = (have_nz ? 3 : 0);
188 
189     // First check the prefix and suffix lengths.
190     // WGS/TSA/TLS prefixes have 4 or 6 letters; CAGE DDBJ prefixes have 5 letters
191     // WGS/TSA/TLS suffixes have 8-10 or 9-11 digits (including 2-digit version);
192     // CAGE DDBJ suffixes have 7 digits
193     SIZE_TYPE min_digits = 0;
194     SIZE_TYPE max_digits = 0;
195 
196     if (is_cage_ddbj) {
197         if (digits_pos != 5)
198             return master_idh;
199         min_digits = 7;
200         max_digits = 7;
201     } else {
202         if (digits_pos != letters_pos+4 && digits_pos != letters_pos+6)
203             return master_idh;
204         min_digits = ((digits_pos == letters_pos+4) ? 8 : 9);
205         max_digits = min_digits + 2;
206     }
207 
208     SIZE_TYPE digits_count = acc.size() - digits_pos;
209     if (digits_count < min_digits || digits_count > max_digits)
210         return master_idh;
211 
212     // Check that prefix and suffix actually consist of letters and digits respectively.
213     if ( !s_GoodLetters(acc.substr(letters_pos, digits_pos-letters_pos)) ) {
214         return master_idh;
215     }
216     if ( !s_GoodDigits(acc.substr(digits_pos)) ) {
217         return master_idh;
218     }
219 
220     // Exclude master accessions
221     // Non-CAGE-DDBJ master accessions may also contain a 2-digit version
222     int version = 0;
223     Uint8 row_id = 0;
224     if (is_cage_ddbj) {
225         version = 1;
226         row_id = NStr::StringToNumeric<Uint8>(acc.substr(digits_pos));
227     } else {
228         version = NStr::StringToNumeric<int>(acc.substr(digits_pos, 2));
229         row_id = NStr::StringToNumeric<Uint8>(acc.substr(digits_pos+2));
230     }
231     if ( !version || !row_id ) {
232         return master_idh;
233     }
234 
235     CSeq_id master_id;
236     master_id.Assign(*id);
237     CTextseq_id* master_text_id =
238         const_cast<CTextseq_id*>(master_id.GetTextseq_Id());
239     string master_acc = acc.substr(0, digits_pos);
240     master_acc.resize(acc.size(), '0');
241     master_text_id->Reset();
242     master_text_id->SetAccession(master_acc);
243     master_text_id->SetVersion(version);
244     master_idh = CSeq_id_Handle::GetHandle(master_id);
245     return master_idh;
246 }
247 
248 
249 CWGSMasterSupport::EDescrType
GetDescrType(const CSeq_id_Handle & master_seq_idh)250 CWGSMasterSupport::GetDescrType(const CSeq_id_Handle& master_seq_idh)
251 {
252     return master_seq_idh.Which() == CSeq_id::e_Other? eDescrTypeRefSeq: eDescrTypeDefault;
253 }
254 
255 
GetForceDescrMask(EDescrType type)256 int CWGSMasterSupport::GetForceDescrMask(EDescrType type)
257 {
258     int force_mask = kForceDescrMask;
259     if ( type != eDescrTypeRefSeq ) {
260         force_mask |= kRefSeqOptionalDescrMask;
261     }
262     return force_mask;
263 }
264 
265 
GetOptionalDescrMask(EDescrType type)266 int CWGSMasterSupport::GetOptionalDescrMask(EDescrType type)
267 {
268     int optional_mask = kForceDescrMask;
269     if ( type == eDescrTypeRefSeq ) {
270         optional_mask |= kRefSeqOptionalDescrMask;
271     }
272     return optional_mask;
273 }
274 
275 
AddMasterDescr(CBioseq_Info & seq,const CSeq_descr & src,EDescrType type)276 void CWGSMasterSupport::AddMasterDescr(CBioseq_Info& seq,
277                                        const CSeq_descr& src,
278                                        EDescrType type)
279 {
280     int existing_mask = 0;
281     CSeq_descr::Tdata& dst = seq.x_SetDescr().Set();
282     ITERATE ( CSeq_descr::Tdata, it, dst ) {
283         const CSeqdesc& desc = **it;
284         existing_mask |= 1 << desc.Which();
285     }
286     int force_mask = GetForceDescrMask(type);
287     int optional_mask = GetOptionalDescrMask(type);
288     ITERATE ( CSeq_descr::Tdata, it, src.Get() ) {
289         int mask = 1 << (*it)->Which();
290         if ( mask & optional_mask ) {
291             if ( mask & existing_mask ) {
292                 continue;
293             }
294         }
295         else if ( !(mask & force_mask) ) {
296             continue;
297         }
298         dst.push_back(*it);
299     }
300 }
301 
302 
HasMasterId(const CBioseq_Info & seq,const CSeq_id_Handle & master_idh)303 bool CWGSMasterSupport::HasMasterId(const CBioseq_Info& seq, const CSeq_id_Handle& master_idh)
304 {
305     if ( master_idh ) {
306         const CBioseq_Info::TId& ids = seq.GetId();
307         ITERATE ( CBioseq_Info::TId, it, ids ) {
308             if ( GetWGSMasterSeq_id(*it) == master_idh ) {
309                 return true;
310             }
311         }
312     }
313     return false;
314 }
315 
316 
GetWGSMasterDescr(CDataLoader * loader,const CSeq_id_Handle & master_idh,int mask,TUserObjectTypesSet & uo_types)317 CRef<CSeq_descr> CWGSMasterSupport::GetWGSMasterDescr(CDataLoader* loader,
318                                                       const CSeq_id_Handle& master_idh,
319                                                       int mask, TUserObjectTypesSet& uo_types)
320 {
321     CRef<CSeq_descr> ret;
322     CDataLoader::TTSE_LockSet locks =
323         loader->GetRecordsNoBlobState(master_idh, CDataLoader::eBioseqCore);
324     ITERATE ( CDataLoader::TTSE_LockSet, it, locks ) {
325         CConstRef<CBioseq_Info> bs_info =
326             (*it)->FindMatchingBioseq(master_idh);
327         if ( !bs_info ) {
328             continue;
329         }
330         if ( bs_info->IsSetDescr() ) {
331             const CSeq_descr::Tdata& descr = bs_info->GetDescr().Get();
332             ITERATE ( CSeq_descr::Tdata, it, descr ) {
333                 if ( s_IsGoodDescr(**it, mask, uo_types) ) {
334                     if ( !ret ) {
335                         ret = new CSeq_descr;
336                     }
337                     ret->Set().push_back(*it);
338                 }
339             }
340         }
341         break;
342     }
343     return ret;
344 }
345 
346 
LoadWGSMaster(CDataLoader * loader,CRef<CTSE_Chunk_Info> chunk)347 void CWGSMasterSupport::LoadWGSMaster(CDataLoader* loader,
348                                       CRef<CTSE_Chunk_Info> chunk)
349 {
350     CWGSMasterChunkInfo& chunk_info =
351         dynamic_cast<CWGSMasterChunkInfo&>(*chunk);
352     CSeq_id_Handle id = chunk_info.m_MasterId;
353     int mask = chunk_info.m_DescrMask;
354     CRef<CSeq_descr> descr =
355         GetWGSMasterDescr(loader, id, mask, chunk_info.m_UserObjectTypes);
356     if ( descr ) {
357         if ( kAddMasterDescrToTSE ) {
358             chunk->x_LoadDescr(CTSE_Chunk_Info::TPlace(), *descr);
359         }
360         else {
361             CRef<CBioseqUpdater> upd(new CWGSBioseqUpdaterDescr(id, descr));
362             const_cast<CTSE_Split_Info&>(chunk->GetSplitInfo()).x_SetBioseqUpdater(upd);
363         }
364     }
365     chunk->SetLoaded();
366 }
367 
368 
AddWGSMaster(CTSE_LoadLock & lock)369 void CWGSMasterSupport::AddWGSMaster(CTSE_LoadLock& lock)
370 {
371     CTSE_Info::TSeqIds ids;
372     lock->GetBioseqsIds(ids);
373     ITERATE ( CTSE_Info::TSeqIds, it, ids ) {
374         if ( CSeq_id_Handle id = GetWGSMasterSeq_id(*it) ) {
375             int mask = kGoodDescrMask;
376             TUserObjectTypesSet existing_uo_types;
377             if ( kAddMasterDescrToTSE ) {
378                 // exclude existing descr types except forced ones (User, Pub, Comment)
379                 int force_descr = GetForceDescrMask(GetDescrType(id));
380                 mask &= ~lock->x_GetBaseInfo().x_GetExistingDescrMask() | force_descr;
381                 lock->x_GetBaseInfo().x_AddExistingUserObjectTypes(existing_uo_types);
382                 if ( lock->IsSet() ) {
383                     if ( auto first_entry = lock->GetSet().GetFirstEntry() ) {
384                         mask &= ~first_entry->x_GetBaseInfo().x_GetExistingDescrMask() | force_descr;
385                         first_entry->x_GetBaseInfo().x_AddExistingUserObjectTypes(existing_uo_types);
386                     }
387                 }
388             }
389             CRef<CTSE_Chunk_Info> chunk(new CWGSMasterChunkInfo(id, mask, existing_uo_types));
390             lock->GetSplitInfo().AddChunk(*chunk);
391             if ( kAddMasterDescrToTSE ) {
392                 chunk->x_AddDescInfo(mask, 0);
393             }
394             else {
395                 CRef<CBioseqUpdater> upd(new CWGSBioseqUpdaterChunk(id));
396                 lock->SetBioseqUpdater(upd);
397             }
398             break;
399         }
400     }
401 }
402 
403 
CWGSBioseqUpdater_Base(const CSeq_id_Handle & master_idh)404 CWGSBioseqUpdater_Base::CWGSBioseqUpdater_Base(const CSeq_id_Handle& master_idh)
405     : m_MasterId(master_idh)
406 {
407 }
408 
409 
~CWGSBioseqUpdater_Base()410 CWGSBioseqUpdater_Base::~CWGSBioseqUpdater_Base()
411 {
412 }
413 
414 
CWGSMasterChunkInfo(const CSeq_id_Handle & master_idh,int mask,TUserObjectTypesSet & uo_types)415 CWGSMasterChunkInfo::CWGSMasterChunkInfo(const CSeq_id_Handle& master_idh,
416                                          int mask, TUserObjectTypesSet& uo_types)
417     : CTSE_Chunk_Info(kMasterWGS_ChunkId),
418       m_MasterId(master_idh),
419       m_DescrMask(mask),
420       m_UserObjectTypes(move(uo_types))
421 {
422 }
423 
424 
~CWGSMasterChunkInfo()425 CWGSMasterChunkInfo::~CWGSMasterChunkInfo()
426 {
427 }
428 
429 
CWGSBioseqUpdaterChunk(const CSeq_id_Handle & master_idh)430 CWGSBioseqUpdaterChunk::CWGSBioseqUpdaterChunk(const CSeq_id_Handle& master_idh)
431     : CWGSBioseqUpdater_Base(master_idh)
432 {
433 }
434 
435 
~CWGSBioseqUpdaterChunk()436 CWGSBioseqUpdaterChunk::~CWGSBioseqUpdaterChunk()
437 {
438 }
439 
440 
Update(CBioseq_Info & seq)441 void CWGSBioseqUpdaterChunk::Update(CBioseq_Info& seq)
442 {
443     if ( HasMasterId(seq) ) {
444         // register master descr chunk
445         seq.x_AddDescrChunkId(kGoodDescrMask, kMasterWGS_ChunkId);
446     }
447 }
448 
449 
CWGSBioseqUpdaterDescr(const CSeq_id_Handle & master_idh,CRef<CSeq_descr> descr)450 CWGSBioseqUpdaterDescr::CWGSBioseqUpdaterDescr(const CSeq_id_Handle& master_idh,
451                                                CRef<CSeq_descr> descr)
452     : CWGSBioseqUpdater_Base(master_idh),
453       m_Descr(descr)
454 {
455 }
456 
457 
~CWGSBioseqUpdaterDescr()458 CWGSBioseqUpdaterDescr::~CWGSBioseqUpdaterDescr()
459 {
460 }
461 
462 
Update(CBioseq_Info & seq)463 void CWGSBioseqUpdaterDescr::Update(CBioseq_Info& seq)
464 {
465     if ( m_Descr &&
466          seq.x_NeedUpdate(seq.fNeedUpdate_descr) &&
467          HasMasterId(seq) ) {
468         AddMasterDescr(seq, *m_Descr, GetDescrType(GetMasterId()));
469     }
470 }
471 
472 
473 END_SCOPE(objects)
474 END_NCBI_SCOPE
475