1 /* $Id: discrepancy_stream.cpp 627092 2021-03-09 14:28:00Z ivanov $
2 * =========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * =========================================================================
25 *
26 * Authors: Sema Kachalo
27 *
28 */
29
30 #include <ncbi_pch.hpp>
31 #include "discrepancy_core.hpp"
32 #include "utils.hpp"
33 #include <sstream>
34 #include <objmgr/object_manager.hpp>
35 #include <objmgr/seqdesc_ci.hpp>
36 #include <objmgr/util/sequence.hpp>
37 #include <serial/objcopy.hpp>
38 #include <util/compress/stream_util.hpp>
39 #include <util/line_reader.hpp>
40 #include <util/format_guess.hpp>
41
42
43 BEGIN_NCBI_SCOPE
44 BEGIN_SCOPE(NDiscrepancy)
45 USING_SCOPE(objects);
46
47
// Nesting depth counter; it is incremented/decremented around Bioseq-set
// reads and consumed only by Offset() below.
static size_t offset = 0;

/// Builds the indentation prefix used by the (commented-out) debug traces:
/// two blanks per nesting level.
string Offset() // LCOV_EXCL_START
{
    const size_t width = offset * 2;
    return string(width, ' ');
} // LCOV_EXCL_STOP
53
54 class CReadHook_Bioseq_set : public CReadObjectHook
55 {
56 public:
CReadHook_Bioseq_set(CDiscrepancyContext * context)57 CReadHook_Bioseq_set(CDiscrepancyContext* context) : m_Context(context) {}
ReadObject(CObjectIStream & stream,const CObjectInfo & passed_info)58 void ReadObject(CObjectIStream& stream, const CObjectInfo& passed_info) override
59 {
60 if (m_Context->Skip()) {
61 m_Context->PushNode(CDiscrepancyContext::eSeqSet);
62 //cout << Offset() << "Skipping Bioseq_set " << m_Context->m_CurrentNode->m_Index << "\n";
63 m_Context->m_CurrentNode->m_Pos = stream.GetStreamPos();
64 DefaultSkip(stream, passed_info);
65 m_Context->PopNode();
66 }
67 else {
68 bool repeat = m_Context->m_CurrentNode->m_Repeat;
69 m_Context->m_CurrentNode->m_Repeat = false;
70 if (!repeat) {
71 m_Context->PushNode(CDiscrepancyContext::eSeqSet);
72 }
73 //cout << Offset() << "Reading " << m_Context->m_CurrentNode->Path() << "\n";
74 offset++;
75 DefaultRead(stream, passed_info);
76 offset--;
77 //cout << Offset() << "Done " << m_Context->m_CurrentNode->Path() << "\n";
78 m_Context->m_CurrentNode->m_Obj.Reset((CObject*)passed_info.GetObjectPtr());
79 if (!repeat) {
80 m_Context->PopNode();
81 }
82 }
83 }
84 protected:
85 CDiscrepancyContext* m_Context;
86 };
87
88
89 class CReadHook_Bioseq : public CReadObjectHook
90 {
91 public:
CReadHook_Bioseq(CDiscrepancyContext * context)92 CReadHook_Bioseq(CDiscrepancyContext* context) : m_Context(context) {}
ReadObject(CObjectIStream & stream,const CObjectInfo & passed_info)93 void ReadObject(CObjectIStream& stream, const CObjectInfo& passed_info) override
94 {
95 if (m_Context->Skip()) {
96 m_Context->PushNode(CDiscrepancyContext::eBioseq);
97 //cout << Offset() << "Skipping Bioseq " << m_Context->m_CurrentNode->m_Index << "\n";
98 m_Context->m_CurrentNode->m_Pos = stream.GetStreamPos();
99 DefaultSkip(stream, passed_info);
100 m_Context->PopNode();
101 }
102 else {
103 bool repeat = m_Context->m_CurrentNode->m_Repeat;
104 m_Context->m_CurrentNode->m_Repeat = false;
105 if (!repeat) {
106 m_Context->PushNode(CDiscrepancyContext::eBioseq);
107 }
108 //cout << Offset() << "Reading " << m_Context->m_CurrentNode->Path() << "\n";
109 DefaultRead(stream, passed_info);
110 m_Context->m_CurrentNode->m_Obj.Reset((CObject*)passed_info.GetObjectPtr());
111 if (!repeat) {
112 m_Context->PopNode();
113 }
114 }
115 }
116 protected:
117 CDiscrepancyContext* m_Context;
118 };
119
120
121 class CReadHook_Bioseq_set_class : public CReadClassMemberHook
122 {
123 public:
CReadHook_Bioseq_set_class(CDiscrepancyContext * context)124 CReadHook_Bioseq_set_class(CDiscrepancyContext* context) : m_Context(context) {}
ReadClassMember(CObjectIStream & stream,const CObjectInfoMI & passed_info)125 void ReadClassMember(CObjectIStream& stream, const CObjectInfoMI& passed_info) override
126 {
127 DefaultRead(stream, passed_info);
128 const CBioseq_set::TClass& cl = *(const CBioseq_set::TClass*)passed_info.GetMember().GetObjectPtr();
129 switch (cl) {
130 case CBioseq_set::eClass_nuc_prot:
131 m_Context->m_CurrentNode->SetType(CDiscrepancyContext::eSeqSet_NucProt);
132 break;
133 case CBioseq_set::eClass_gen_prod_set:
134 m_Context->m_CurrentNode->SetType(CDiscrepancyContext::eSeqSet_GenProd);
135 break;
136 case CBioseq_set::eClass_segset:
137 m_Context->m_CurrentNode->SetType(CDiscrepancyContext::eSeqSet_SegSet);
138 break;
139 case CBioseq_set::eClass_small_genome_set:
140 m_Context->m_CurrentNode->SetType(CDiscrepancyContext::eSeqSet_Genome);
141 break;
142 case CBioseq_set::eClass_eco_set:
143 case CBioseq_set::eClass_mut_set:
144 case CBioseq_set::eClass_phy_set:
145 case CBioseq_set::eClass_pop_set:
146 m_Context->m_CurrentNode->SetType(CDiscrepancyContext::eSeqSet_Funny);
147 break;
148 default:
149 break;
150 }
151 }
152
153 protected:
154 CDiscrepancyContext* m_Context;
155 };
156
157
PushNode(EObjType type)158 void CDiscrepancyContext::PushNode(EObjType type)
159 {
160 CRef<CParseNode> new_node(new CParseNode(type, (unsigned)m_CurrentNode->m_Children.size(), m_CurrentNode));
161 m_CurrentNode->m_Children.push_back(new_node);
162 m_CurrentNode.Reset(new_node);
163 }
164
165
Skip()166 bool CDiscrepancyContext::Skip()
167 {
168 // Not skipping the first child or nuc-prot children
169 return m_Skip && m_CurrentNode->m_Type == eSeqSet && !m_CurrentNode->m_Repeat && m_CurrentNode->m_Children.size();
170 }
171
172
ParseStrings(const string & fname)173 void CDiscrepancyContext::ParseStrings(const string& fname)
174 {
175 m_RootNode.Reset(new CParseNode(eFile, 0));
176 m_RootNode->m_Ref->m_Text = fname;
177 m_CurrentNode.Reset(m_RootNode);
178
179 CNcbiIfstream istr(fname);
180 CStreamLineReader line_reader(istr);
181 do {
182 PushNode(eString);
183 m_CurrentNode->m_Ref->m_Text = *++line_reader;
184 RunTests();
185 PopNode();
186 }
187 while (!line_reader.AtEOF());
188 }
189
190
ParseStream(CObjectIStream & stream,const string & fname,bool skip,const string & default_header)191 void CDiscrepancyContext::ParseStream(CObjectIStream& stream, const string& fname, bool skip, const string& default_header)
192 {
193 m_Skip = skip;
194 CObjectTypeInfo(CType<CBioseq_set>()).SetLocalReadHook(stream, new CReadHook_Bioseq_set(this));
195 CObjectTypeInfo(CType<CBioseq_set>()).FindMember("class").SetLocalReadHook(stream, new CReadHook_Bioseq_set_class(this));
196 CObjectTypeInfo(CType<CBioseq>()).SetLocalReadHook(stream, new CReadHook_Bioseq(this));
197
198 m_RootNode.Reset(new CParseNode(eFile, 0));
199 m_RootNode->m_Ref->m_Text = fname;
200 m_CurrentNode.Reset(m_RootNode);
201
202 while (true) {
203 string header = stream.ReadFileHeader();
204 if (header.empty()) {
205 header = default_header;
206 }
207 //cout << "Reading " << header << "\n";
208 PushNode(eNone);
209
210 if (header == CSeq_submit::GetTypeInfo()->GetName()) {
211 PushNode(eSubmit);
212 CRef<CSeq_submit> ss(new CSeq_submit);
213 stream.Read(ObjectInfo(*ss), CObjectIStream::eNoFileHeader);
214 m_CurrentNode->m_Obj.Reset(ss);
215 PopNode();
216 }
217 else if (header == CSeq_entry::GetTypeInfo()->GetName()) {
218 CRef<CSeq_entry> se(new CSeq_entry);
219 stream.Read(ObjectInfo(*se), CObjectIStream::eNoFileHeader);
220 }
221 else if (header == CBioseq_set::GetTypeInfo()->GetName()) {
222 CRef<CBioseq_set> set(new CBioseq_set);
223 stream.Read(ObjectInfo(*set), CObjectIStream::eNoFileHeader);
224 }
225 else if (header == CBioseq::GetTypeInfo()->GetName()) {
226 CRef<CBioseq> seq(new CBioseq);
227 stream.Read(ObjectInfo(*seq), CObjectIStream::eNoFileHeader);
228 }
229 else {
230 NCBI_THROW(CException, eUnknown, "Unsupported type " + header); // LCOV_EXCL_LINE
231 }
232 CNcbiStreampos position = stream.GetStreamPos();
233 Extend(*m_CurrentNode, stream);
234 if (m_Skip) {
235 stream.SetStreamPos(position);
236 }
237 PopNode();
238 if (stream.EndOfData()) {
239 break;
240 }
241 }
242 }
243
244
Extend(CParseNode & node,CObjectIStream & stream)245 void CDiscrepancyContext::Extend(CParseNode& node, CObjectIStream& stream)
246 {
247 //cout << "Reading " << node.Path() << "\n";
248 bool load = (node.m_Type == eSeqSet_NucProt && !InGenProdSet(&node)) || (node.m_Type == eSeqSet_GenProd && !InNucProtSet(&node)) || (node.m_Type == eBioseq && !InNucProtSet(&node) && !InGenProdSet(&node));
249 if (load) {
250 CRef<CSeq_entry> se(new CSeq_entry());
251 if (node.m_Type == eBioseq) {
252 se->SetSeq((CBioseq&)*node.m_Obj);
253 }
254 else { // node.m_Type == eSeqSet_NucProt
255 se->SetSet((CBioseq_set&)*node.m_Obj);
256 }
257 auto handle = m_Scope->AddTopLevelSeqEntry(*se);
258 m_FeatTree.Reset(new feature::CFeatTree(handle));
259 }
260 Populate(node);
261
262 for (size_t i = 0; i < node.m_Children.size(); i++) {
263 CParseNode& item = *node.m_Children[i];
264 if (!item.m_Obj) {
265 stream.SetStreamPos(item.m_Pos);
266 item.m_Repeat = true;
267 m_CurrentNode.Reset(&item);
268 if (item.m_Type == eBioseq) {
269 CRef<CBioseq> seq(new CBioseq);
270 stream.Read(ObjectInfo(*seq), CObjectIStream::eNoFileHeader);
271 }
272 else if (item.m_Type == eSeqSet) {
273 CRef<CBioseq_set> set(new CBioseq_set);
274 stream.Read(ObjectInfo(*set), CObjectIStream::eNoFileHeader);
275 }
276 }
277 Extend(item, stream);
278 if (node.m_Type != eSeqSet_NucProt && node.m_Type != eSeqSet_GenProd) {
279 node.m_Children[i].Reset();
280 }
281 }
282
283 //cout << "Running tests on " << node.Path() << " ...\n";
284 m_CurrentNode.Reset(&node);
285 RunTests();
286
287 if (load) {
288 //m_FeatTree.Reset();
289 m_Scope->ResetDataAndHistory();
290 }
291 }
292
293
ParseObject(const CBioseq & root)294 void CDiscrepancyContext::ParseObject(const CBioseq& root)
295 {
296 CRef<CParseNode> current = m_CurrentNode;
297 PushNode(eBioseq);
298 m_CurrentNode->m_Obj.Reset(&root);
299 m_CurrentNode = current;
300 }
301
302
ParseObject(const CBioseq_set & root)303 void CDiscrepancyContext::ParseObject(const CBioseq_set& root)
304 {
305 CRef<CParseNode> current = m_CurrentNode;
306 EObjType type = eSeqSet;
307 if (root.IsSetClass()) {
308 switch (root.GetClass()) {
309 case CBioseq_set::eClass_nuc_prot:
310 type = eSeqSet_NucProt;
311 break;
312 case CBioseq_set::eClass_gen_prod_set:
313 type = eSeqSet_GenProd;
314 break;
315 case CBioseq_set::eClass_segset:
316 type = eSeqSet_SegSet;
317 break;
318 case CBioseq_set::eClass_small_genome_set:
319 type = eSeqSet_Genome;
320 break;
321 case CBioseq_set::eClass_eco_set:
322 case CBioseq_set::eClass_mut_set:
323 case CBioseq_set::eClass_phy_set:
324 case CBioseq_set::eClass_pop_set:
325 type = eSeqSet_Funny;
326 break;
327 default:
328 break;
329 }
330 }
331 PushNode(type);
332 m_CurrentNode->m_Obj.Reset(&root);
333 if (root.CanGetSeq_set()) {
334 for (const auto& entry : root.GetSeq_set()) {
335 ParseObject(*entry);
336 }
337 }
338 m_CurrentNode = current;
339 }
340
341
ParseObject(const CSeq_entry & root)342 void CDiscrepancyContext::ParseObject(const CSeq_entry& root)
343 {
344 if (root.IsSet()) {
345 ParseObject(root.GetSet());
346 }
347 else if (root.IsSeq()) {
348 ParseObject(root.GetSeq());
349 }
350 }
351
352
ParseObject(const CSeq_submit & root)353 void CDiscrepancyContext::ParseObject(const CSeq_submit& root)
354 {
355 CRef<CParseNode> current = m_CurrentNode;
356 PushNode(eSubmit);
357 m_CurrentNode->m_Obj.Reset(&root);
358 if (root.CanGetData() && root.GetData().IsEntrys()) {
359 for (const auto& entry : root.GetData().GetEntrys()) {
360 ParseObject(*entry);
361 }
362 }
363 m_CurrentNode = current;
364 }
365
366
ParseAll(CParseNode & node)367 void CDiscrepancyContext::ParseAll(CParseNode& node)
368 {
369 Populate(node);
370 for (auto& item : node.m_Children) {
371 ParseAll(*item);
372 }
373 m_CurrentNode.Reset(&node);
374 RunTests();
375 }
376
377
Populate(CParseNode & node)378 void CDiscrepancyContext::Populate(CParseNode& node)
379 {
380 switch (node.m_Type) {
381 case eSeqSet:
382 case eSeqSet_NucProt:
383 case eSeqSet_GenProd:
384 case eSeqSet_SegSet:
385 case eSeqSet_Genome:
386 case eSeqSet_Funny:
387 PopulateSeqSet(node);
388 break;
389 case eBioseq:
390 PopulateBioseq(node);
391 break;
392 case eSubmit:
393 PopulateSubmit(node);
394 break;
395 default:
396 break;
397 }
398 }
399
400
PopulateSubmit(CParseNode & node)401 void CDiscrepancyContext::PopulateSubmit(CParseNode& node)
402 {
403 const CSeq_submit& sub = dynamic_cast<const CSeq_submit&>(*node.m_Obj);
404 if (sub.IsSetSub()) {
405 if (sub.GetSub().IsSetCit() && sub.GetSub().GetCit().CanGetAuthors()) {
406 const CAuth_list* auth = &sub.GetSub().GetCit().GetAuthors();
407 node.m_Authors.push_back(auth);
408 node.m_AuthorMap[auth] = &node;
409 }
410 }
411 }
412
413
PopulateBioseq(CParseNode & node)414 void CDiscrepancyContext::PopulateBioseq(CParseNode& node)
415 {
416 const CBioseq& bioseq = dynamic_cast<const CBioseq&>(*node.m_Obj);
417 if (bioseq.CanGetDescr() && bioseq.GetDescr().CanGet()) {
418 for (const auto& desc : bioseq.GetDescr().Get()) {
419 node.AddDescriptor(*desc);
420 if (desc->IsMolinfo()) {
421 node.m_Molinfo.Reset(desc);
422 }
423 else if (desc->IsSource()) {
424 node.m_Biosource.Reset(desc);
425 }
426 else if (desc->IsTitle()) {
427 node.m_Title.Reset(desc);
428 }
429 }
430 }
431 if (bioseq.IsSetAnnot()) {
432 for (const auto& annot : bioseq.GetAnnot()) {
433 if (annot->IsFtable()) {
434 for (const auto& feat : annot->GetData().GetFtable()) {
435 node.AddFeature(*feat);
436 }
437 }
438 }
439 }
440 node.m_BioseqSummary.reset(new CSeqSummary());
441 BuildSeqSummary(bioseq, *node.m_BioseqSummary);
442 string label = node.m_BioseqSummary->Label;
443 node.m_Ref->m_Text = node.m_BioseqSummary->Label + "\n" + node.m_BioseqSummary->GetStats();
444 for (CParseNode* n = node.m_Parent; n; n = n->m_Parent) {
445 if ((!IsSeqSet(n->m_Type) && n->m_Type != eSubmit) || !n->m_Ref->m_Text.empty()) {
446 break;
447 }
448 n->m_Ref->m_Text = n->m_Type == eSeqSet_NucProt || n->m_Type == eSeqSet_SegSet ? node.m_BioseqSummary->Label : label;
449 label = n->m_Ref->GetText();
450 }
451 if (node.m_Biosource) {
452 for (CParseNode* n = node.m_Parent; n && IsSeqSet(n->m_Type); n = n->m_Parent) {
453 if (n->m_Type == eSeqSet_Genome || n->m_Type == eSeqSet_Funny) {
454 n->m_SetBiosources.push_back(node.m_Biosource);
455 }
456 }
457 }
458 }
459
460
PopulateSeqSet(CParseNode & node)461 void CDiscrepancyContext::PopulateSeqSet(CParseNode& node)
462 {
463 const CBioseq_set& seqset = dynamic_cast<const CBioseq_set&>(*node.m_Obj);
464 if (seqset.CanGetDescr() && seqset.GetDescr().CanGet()) {
465 for (const auto& desc : seqset.GetDescr().Get()) {
466 node.AddDescriptor(*desc);
467 if (desc->IsMolinfo()) {
468 node.m_Molinfo.Reset(desc);
469 }
470 else if (desc->IsSource()) {
471 node.m_Biosource.Reset(desc);
472 }
473 else if (desc->IsTitle()) {
474 node.m_Title.Reset(desc);
475 }
476 }
477 }
478 if (seqset.IsSetAnnot()) {
479 for (const auto& annot : seqset.GetAnnot()) {
480 if (annot->IsFtable()) {
481 for (const auto& feat : annot->GetData().GetFtable()) {
482 node.AddFeature(*feat);
483 }
484 }
485 }
486 }
487 if (node.m_Biosource) {
488 for (CParseNode* n = &node; n && IsSeqSet(n->m_Type); n = n->m_Parent) {
489 if (n->m_Type == eSeqSet_Genome) {
490 n->m_SetBiosources.push_back(node.m_Biosource);
491 }
492 }
493 }
494 }
495
496
FindNode(const CRefNode & ref)497 CDiscrepancyContext::CParseNode* CDiscrepancyContext::FindNode(const CRefNode& ref)
498 {
499 auto it = m_NodeMap.find(&ref);
500 if (it != m_NodeMap.end()) {
501 return it->second;
502 }
503 if (ref.m_Parent) {
504 CParseNode* p = FindNode(*ref.m_Parent);
505 if (p) {
506 switch (ref.m_Type) {
507 case eSeqFeat:
508 m_NodeMap[&ref] = p->m_Features[ref.m_Index];
509 break;
510 case eSeqDesc:
511 m_NodeMap[&ref] = p->m_Descriptors[ref.m_Index];
512 break;
513 default:
514 m_NodeMap[&ref] = p->m_Children[ref.m_Index];
515 break;
516 }
517 return m_NodeMap[&ref];
518 }
519 }
520 return nullptr;
521 }
522
523
FindObject(CReportObj & obj,bool alt)524 const CSerialObject* CDiscrepancyContext::FindObject(CReportObj& obj, bool alt)
525 {
526 CDiscrepancyObject& p = static_cast<CDiscrepancyObject&>(obj);
527 CParseNode* node = FindNode(alt ? *p.m_Fix : *p.m_Ref);
528 return node ? dynamic_cast<const CSerialObject*>(&*node->m_Obj) : nullptr;
529 }
530
531
ReplaceObject(CReportObj & obj,CSerialObject * ser,bool alt)532 void CDiscrepancyContext::ReplaceObject(CReportObj& obj, CSerialObject* ser, bool alt)
533 {
534 CDiscrepancyObject* p = static_cast<CDiscrepancyObject*>(&obj);
535 CParseNode* node = FindNode(alt ? *p->m_Fix : *p->m_Ref);
536 node->m_Obj.Reset(ser);
537 }
538
539
ReplaceSeq_feat(CReportObj & obj,const CSeq_feat & old_feat,CSeq_feat & new_feat,bool alt)540 void CDiscrepancyContext::ReplaceSeq_feat(CReportObj& obj, const CSeq_feat& old_feat, CSeq_feat& new_feat, bool alt)
541 {
542 if (m_AF_Seq_annot) {
543 auto& ftable = m_AF_Seq_annot->SetData().SetFtable();
544 for (auto& feat : ftable) {
545 if (&*feat == &old_feat) {
546 feat.Reset(&new_feat);
547 }
548 }
549 }
550 else {
551 CSeq_feat_EditHandle feh(GetScope().GetSeq_featHandle(old_feat));
552 feh.Replace(new_feat);
553 }
554 ReplaceObject(obj, &new_feat, alt);
555 }
556
557
558 // AUTOFIX ////////////////////////////////////////////////////////////////////////
559
560 class CCopyHook_Bioseq_set : public CCopyObjectHook
561 {
562 public:
CCopyHook_Bioseq_set(CDiscrepancyContext * context)563 CCopyHook_Bioseq_set(CDiscrepancyContext* context) : m_Context(context) {}
CopyObject(CObjectStreamCopier & copier,const CObjectTypeInfo & passed_info)564 void CopyObject(CObjectStreamCopier& copier, const CObjectTypeInfo& passed_info) override
565 {
566 m_Context->PushNode(CDiscrepancyContext::eSeqSet);
567 if (m_Context->CanFixBioseq_set()) {
568 m_Context->m_AF_Bioseq_set.Reset(new CBioseq_set);
569 copier.In().ReadObject(m_Context->m_AF_Bioseq_set, passed_info.GetTypeInfo());
570 m_Context->m_CurrentNode->m_Obj.Reset(m_Context->m_AF_Bioseq_set);
571 CRef<CSeq_entry> se(new CSeq_entry());
572 se->SetSet(*m_Context->m_AF_Bioseq_set);
573 auto handle = m_Context->m_Scope->AddTopLevelSeqEntry(*se);
574 m_Context->m_FeatTree.Reset(new feature::CFeatTree(handle));
575 m_Context->AutofixBioseq_set();
576 copier.Out().WriteObject(m_Context->m_AF_Bioseq_set, passed_info.GetTypeInfo());
577 m_Context->m_AF_Bioseq_set.Reset();
578 }
579 else {
580 DefaultCopy(copier, passed_info);
581 }
582 m_Context->PopNode();
583 }
584 protected:
585 CDiscrepancyContext* m_Context;
586 };
587
588
589 class CCopyHook_Bioseq : public CCopyObjectHook
590 {
591 public:
CCopyHook_Bioseq(CDiscrepancyContext * context)592 CCopyHook_Bioseq(CDiscrepancyContext* context) : m_Context(context) {}
CopyObject(CObjectStreamCopier & copier,const CObjectTypeInfo & passed_info)593 void CopyObject(CObjectStreamCopier& copier, const CObjectTypeInfo& passed_info) override
594 {
595 m_Context->PushNode(CDiscrepancyContext::eBioseq);
596 if (m_Context->CanFixBioseq()) {
597 m_Context->m_AF_Bioseq.Reset(new CBioseq);
598 copier.In().ReadObject(m_Context->m_AF_Bioseq, passed_info.GetTypeInfo());
599 m_Context->m_CurrentNode->m_Obj.Reset(m_Context->m_AF_Bioseq);
600 CRef<CSeq_entry> se(new CSeq_entry());
601 se->SetSeq(*m_Context->m_AF_Bioseq);
602 auto handle = m_Context->m_Scope->AddTopLevelSeqEntry(*se);
603 m_Context->m_FeatTree.Reset(new feature::CFeatTree(handle));
604 m_Context->AutofixBioseq();
605 copier.Out().WriteObject(m_Context->m_AF_Bioseq, passed_info.GetTypeInfo());
606 m_Context->m_AF_Bioseq.Reset();
607 }
608 else {
609 DefaultCopy(copier, passed_info);
610 }
611 m_Context->PopNode();
612 }
613 protected:
614 CDiscrepancyContext* m_Context;
615 };
616
617
618 class CCopyHook_Seq_descr : public CCopyObjectHook
619 {
620 public:
CCopyHook_Seq_descr(CDiscrepancyContext * context)621 CCopyHook_Seq_descr(CDiscrepancyContext* context) : m_Context(context) {}
CopyObject(CObjectStreamCopier & copier,const CObjectTypeInfo & passed_info)622 void CopyObject(CObjectStreamCopier& copier, const CObjectTypeInfo& passed_info) override
623 {
624 if (m_Context->CanFixSeqdesc()) {
625 m_Context->m_AF_Seq_descr.Reset(new CSeq_descr);
626 copier.In().ReadObject(m_Context->m_AF_Seq_descr, passed_info.GetTypeInfo());
627 m_Context->AutofixSeq_descr();
628 copier.Out().WriteObject(m_Context->m_AF_Seq_descr, passed_info.GetTypeInfo());
629 m_Context->m_AF_Seq_descr.Reset();
630 }
631 else {
632 DefaultCopy(copier, passed_info);
633 }
634 }
635 protected:
636 CDiscrepancyContext* m_Context;
637 };
638
639
640 class CCopyHook_Seq_annot : public CCopyObjectHook
641 {
642 public:
CCopyHook_Seq_annot(CDiscrepancyContext * context)643 CCopyHook_Seq_annot(CDiscrepancyContext* context) : m_Context(context) {}
CopyObject(CObjectStreamCopier & copier,const CObjectTypeInfo & passed_info)644 void CopyObject(CObjectStreamCopier& copier, const CObjectTypeInfo& passed_info) override
645 {
646 if (m_Context->CanFixSeq_annot()) {
647 m_Context->m_AF_Seq_annot.Reset(new CSeq_annot);
648 copier.In().ReadObject(m_Context->m_AF_Seq_annot, passed_info.GetTypeInfo());
649 m_Context->AutofixSeq_annot();
650 copier.Out().WriteObject(m_Context->m_AF_Seq_annot, passed_info.GetTypeInfo());
651 m_Context->m_AF_Seq_annot.Reset();
652 }
653 else {
654 DefaultCopy(copier, passed_info);
655 }
656 }
657 protected:
658 CDiscrepancyContext* m_Context;
659 };
660
661
662 class CCopyHook_Submit_block : public CCopyObjectHook
663 {
664 public:
CCopyHook_Submit_block(CDiscrepancyContext * context)665 CCopyHook_Submit_block(CDiscrepancyContext* context) : m_Context(context) {}
CopyObject(CObjectStreamCopier & copier,const CObjectTypeInfo & passed_info)666 void CopyObject(CObjectStreamCopier& copier, const CObjectTypeInfo& passed_info) override
667 {
668 if (m_Context->CanFixSubmit_block()) {
669 m_Context->m_AF_Submit_block.Reset(new CSubmit_block);
670 copier.In().ReadObject(m_Context->m_AF_Submit_block, passed_info.GetTypeInfo());
671 m_Context->AutofixSubmit_block();
672 copier.Out().WriteObject(m_Context->m_AF_Submit_block, passed_info.GetTypeInfo());
673 m_Context->m_AF_Submit_block.Reset();
674 }
675 else {
676 DefaultCopy(copier, passed_info);
677 }
678 }
679 protected:
680 CDiscrepancyContext* m_Context;
681 };
682
683
OpenUncompressedStream(const string & fname,bool & compressed)684 unique_ptr<CObjectIStream> OpenUncompressedStream(const string& fname, bool& compressed) // One more copy!!!
685 {
686 unique_ptr<CNcbiIstream> InputStream(new CNcbiIfstream(fname, ios::binary));
687 CCompressStream::EMethod method;
688
689 CFormatGuess::EFormat format = CFormatGuess::Format(*InputStream);
690 switch (format) {
691 case CFormatGuess::eGZip: method = CCompressStream::eGZipFile; break;
692 case CFormatGuess::eBZip2: method = CCompressStream::eBZip2; break;
693 case CFormatGuess::eLzo: method = CCompressStream::eLZO; break;
694 default: method = CCompressStream::eNone; break;
695 }
696 compressed = method != CCompressStream::eNone;
697 if (compressed) {
698 InputStream.reset(new CDecompressIStream(*InputStream.release(), method, CCompressStream::fDefault, eTakeOwnership));
699 format = CFormatGuess::Format(*InputStream);
700 }
701
702 unique_ptr<CObjectIStream> objectStream;
703 switch (format)
704 {
705 case CFormatGuess::eBinaryASN:
706 case CFormatGuess::eTextASN:
707 objectStream.reset(CObjectIStream::Open(format == CFormatGuess::eBinaryASN ? eSerial_AsnBinary : eSerial_AsnText, *InputStream.release(), eTakeOwnership));
708 break;
709 default:
710 break;
711 }
712 objectStream->SetDelayBufferParsingPolicy(CObjectIStream::eDelayBufferPolicyAlwaysParse);
713 return objectStream;
714 }
715
716
Autofix(TReportObjectList & tofix,map<string,size_t> & rep,const string & default_header)717 void CDiscrepancyContext::Autofix(TReportObjectList& tofix, map<string, size_t>& rep, const string& default_header)
718 {
719 if (!tofix.empty()) {
720 sort(tofix.begin(), tofix.end(), CompareRefs);
721 bool in_file = false;
722 for (const CRefNode* node = static_cast<CDiscrepancyObject&>(*tofix[0]).m_Fix; node; node = node->m_Parent) {
723 if (node->m_Type == eFile) in_file = true;
724 }
725 if (!in_file) { // GBench etc. -- all objects already in the scope
726 for (auto& fix : tofix) {
727 CDiscrepancyObject& obj = static_cast<CDiscrepancyObject&>(*fix);
728 CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*obj.m_Case).Autofix(&obj, *this);
729 if (result) {
730 rep[result->GetS()] += result->GetN();
731 }
732 }
733 return;
734 }
735
736 vector<vector<CDiscrepancyObject*>> all_fixes;
737 string current_path;
738 for (auto& fix : tofix) {
739 string path;
740 CDiscrepancyObject& obj = static_cast<CDiscrepancyObject&>(*fix);
741 for (const CRefNode* node = obj.m_Fix; node; node = node->m_Parent) {
742 if (node->m_Type == eFile) {
743 path = node->m_Text;
744 break;
745 }
746 }
747 if (path != current_path) {
748 current_path = path;
749 vector<CDiscrepancyObject*> fixes;
750 all_fixes.push_back(fixes);
751 }
752 all_fixes.back().push_back(&obj);
753 }
754 for (auto& fix : all_fixes) {
755 AutofixFile(fix, default_header);
756 }
757 }
758 }
759
760
AutofixFile(vector<CDiscrepancyObject * > & fixes,const string & default_header)761 void CDiscrepancyContext::AutofixFile(vector<CDiscrepancyObject*>&fixes, const string& default_header)
762 {
763 string path;
764 for (CRefNode* node = fixes[0]->m_Fix; node; node = node->m_Parent) {
765 if (node->m_Type == eFile) {
766 path = node->m_Text;
767 break;
768 }
769 }
770 bool compressed = false;
771 unique_ptr<CObjectIStream> in = OpenUncompressedStream(path, compressed);
772 cout << "Autofixing " << path << "\n";
773
774 size_t dot = path.find_last_of('.');
775 if (dot != string::npos) {
776 size_t slash = path.find_last_of("/\\");
777 if (slash != string::npos && slash >= dot) {
778 dot = string::npos;
779 }
780 }
781 string fixed_path = !compressed && (dot != string::npos) ? path.substr(0, dot) + ".autofix" + path.substr(dot) : path + ".autofix.sqn";
782
783 string header = in->ReadFileHeader();
784 in = OpenUncompressedStream(path, compressed);
785 unique_ptr<CObjectOStream> out(CObjectOStream::Open(eSerial_AsnText, fixed_path));
786 CObjectStreamCopier copier(*in, *out);
787
788 m_Fixes = &fixes;
789 m_RootNode.Reset(new CParseNode(eFile, 0));
790 m_CurrentNode.Reset(m_RootNode);
791
792 CObjectTypeInfo(CType<CBioseq_set>()).SetLocalCopyHook(copier, new CCopyHook_Bioseq_set(this));
793 CObjectTypeInfo(CType<CBioseq>()).SetLocalCopyHook(copier, new CCopyHook_Bioseq(this));
794 CObjectTypeInfo(CType<CSeq_descr>()).SetLocalCopyHook(copier, new CCopyHook_Seq_descr(this));
795 CObjectTypeInfo(CType<CSeq_annot>()).SetLocalCopyHook(copier, new CCopyHook_Seq_annot(this));
796 CObjectTypeInfo(CType<CSubmit_block>()).SetLocalCopyHook(copier, new CCopyHook_Submit_block(this));
797
798 while (true) {
799 if (header.empty()) {
800 header = default_header;
801 }
802 //cout << "Reading " << header << "\n";
803
804 PushNode(eNone);
805
806 if (header == CSeq_submit::GetTypeInfo()->GetName()) {
807 PushNode(eSubmit);
808 copier.Copy(CSeq_submit::GetTypeInfo());
809 PopNode();
810 }
811 else if (header == CSeq_entry::GetTypeInfo()->GetName()) {
812 copier.Copy(CSeq_entry::GetTypeInfo());
813 }
814 else if (header == CBioseq_set::GetTypeInfo()->GetName()) {
815 copier.Copy(CBioseq_set::GetTypeInfo());
816 }
817 else if (header == CBioseq::GetTypeInfo()->GetName()) {
818 copier.Copy(CBioseq::GetTypeInfo());
819 }
820 else {
821 NCBI_THROW(CException, eUnknown, "Unsupported type " + header); // LCOV_EXCL_LINE
822 }
823 PopNode();
824 if (in->EndOfData()) {
825 break;
826 }
827 else {
828 // this will crash if the file is both compressed and concatenated,
829 // but we are not going to support those
830 CNcbiStreampos position = in->GetStreamPos();
831 header = in->ReadFileHeader();
832 in->SetStreamPos(position);
833 }
834 }
835 }
836
837
CanFixBioseq_set(CRefNode & refnode)838 bool CDiscrepancyContext::CanFixBioseq_set(CRefNode& refnode)
839 {
840 if (IsSeqSet(refnode.m_Type)) {
841 CRef<CRefNode> A(&refnode);
842 auto B = m_CurrentNode->m_Ref;
843 while (A && B) {
844 if (A->m_Index != B->m_Index) {
845 return false;
846 }
847 A = A->m_Parent;
848 B = B->m_Parent;
849 if (!A && !B) {
850 return true;
851 }
852 }
853 }
854 return false;
855 }
856
857
CanFixBioseq_set()858 bool CDiscrepancyContext::CanFixBioseq_set()
859 {
860 for (auto* fix : *m_Fixes) {
861 if (CanFixBioseq_set(*fix->m_Fix)) {
862 return true;
863 }
864 }
865 return false;
866 }
867
868
869
CanFixBioseq(CRefNode & refnode)870 bool CDiscrepancyContext::CanFixBioseq(CRefNode& refnode)
871 {
872 if (refnode.m_Type == eBioseq) {
873 CRef<CRefNode> A(&refnode);
874 auto B = m_CurrentNode->m_Ref;
875 while (A && B) {
876 if (A->m_Index != B->m_Index) {
877 return false;
878 }
879 A = A->m_Parent;
880 B = B->m_Parent;
881 if (!A && !B) {
882 return true;
883 }
884 }
885 }
886 return false;
887 }
888
889
CanFixBioseq()890 bool CDiscrepancyContext::CanFixBioseq()
891 {
892 for (auto* fix : *m_Fixes) {
893 if (CanFixBioseq(*fix->m_Fix)) {
894 return true;
895 }
896 }
897 return false;
898 }
899
900
CanFixFeat(CRefNode & refnode)901 bool CDiscrepancyContext::CanFixFeat(CRefNode& refnode)
902 {
903 if (refnode.m_Type == eSeqFeat) {
904 auto A = refnode.m_Parent;
905 auto B = m_CurrentNode->m_Ref;
906 while (A && B) {
907 if (A->m_Index != B->m_Index) {
908 return false;
909 }
910 A = A->m_Parent;
911 B = B->m_Parent;
912 if (!A && !B) {
913 return true;
914 }
915 }
916 }
917 return false;
918 }
919
920
CanFixSeq_annot()921 bool CDiscrepancyContext::CanFixSeq_annot()
922 {
923 for (auto* fix : *m_Fixes) {
924 if (CanFixFeat(*fix->m_Fix)) {
925 return true;
926 }
927 }
928 return false;
929 }
930
931
CanFixDesc(CRefNode & refnode)932 bool CDiscrepancyContext::CanFixDesc(CRefNode& refnode)
933 {
934 if (refnode.m_Type == eSeqDesc) {
935 auto A = refnode.m_Parent;
936 auto B = m_CurrentNode->m_Ref;
937 while (A && B) {
938 if (A->m_Index != B->m_Index) {
939 return false;
940 }
941 A = A->m_Parent;
942 B = B->m_Parent;
943 if (!A && !B) {
944 return true;
945 }
946 }
947 }
948 return false;
949 }
950
951
CanFixSeqdesc()952 bool CDiscrepancyContext::CanFixSeqdesc()
953 {
954 for (auto* fix : *m_Fixes) {
955 if (CanFixDesc(*fix->m_Fix)) {
956 return true;
957 }
958 }
959 return false;
960 }
961
962
CanFixSubmit_block(CRefNode & refnode)963 bool CDiscrepancyContext::CanFixSubmit_block(CRefNode& refnode)
964 {
965 if (refnode.m_Type == eSubmit && m_CurrentNode->m_Ref->m_Type == eSubmit) {
966 CRef<CRefNode> A(&refnode);
967 auto B = m_CurrentNode->m_Ref;
968 while (A && B) {
969 if (A->m_Index != B->m_Index) {
970 return false;
971 }
972 A = A->m_Parent;
973 B = B->m_Parent;
974 if (!A && !B) {
975 return true;
976 }
977 }
978 }
979 return false;
980 }
981
982
CanFixSubmit_block()983 bool CDiscrepancyContext::CanFixSubmit_block()
984 {
985 for (auto* fix : *m_Fixes) {
986 if (CanFixSubmit_block(*fix->m_Fix)) {
987 return true;
988 }
989 }
990 return false;
991 }
992
993
AutofixSeq_annot()994 void CDiscrepancyContext::AutofixSeq_annot()
995 {
996 if (m_AF_Seq_annot->IsFtable()) {
997 for (auto& feat : m_AF_Seq_annot->GetData().GetFtable()) {
998 m_CurrentNode->AddFeature(*feat);
999 }
1000 }
1001
1002 for (auto* fix : *m_Fixes) {
1003 if (CanFixFeat(*fix->m_Fix) && fix->m_Fix->m_Index < m_CurrentNode->m_Features.size()) {
1004 m_NodeMap[&*fix->m_Fix] = m_CurrentNode->m_Features[fix->m_Fix->m_Index];
1005 CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*fix->m_Case).Autofix(fix, *this);
1006 }
1007 }
1008 }
1009
1010
AutofixSeq_descr()1011 void CDiscrepancyContext::AutofixSeq_descr()
1012 {
1013 if (m_AF_Seq_descr->CanGet()) {
1014 for (auto& desc : m_AF_Seq_descr->Get()) {
1015 m_CurrentNode->AddDescriptor(*desc);
1016 }
1017 }
1018
1019 for (auto* fix : *m_Fixes) {
1020 if (CanFixDesc(*fix->m_Fix) && fix->m_Fix->m_Index < m_CurrentNode->m_Descriptors.size()) {
1021 m_NodeMap[&*fix->m_Fix] = m_CurrentNode->m_Descriptors[fix->m_Fix->m_Index];
1022 CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*fix->m_Case).Autofix(fix, *this);
1023 }
1024 }
1025 }
1026
1027
AutofixSubmit_block()1028 void CDiscrepancyContext::AutofixSubmit_block()
1029 {
1030 CRef<CParseNode> sblock(new CParseNode(eSubmitBlock, 0));
1031 sblock->m_Obj.Reset(static_cast<CObject*>(&*m_AF_Submit_block));
1032
1033 for (auto* fix : *m_Fixes) {
1034 if (CanFixSubmit_block(*fix->m_Fix)) {
1035 m_NodeMap[&*fix->m_Fix] = sblock;
1036 CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*fix->m_Case).Autofix(fix, *this);
1037 }
1038 }
1039 }
1040
1041
AutofixBioseq_set()1042 void CDiscrepancyContext::AutofixBioseq_set()
1043 {
1044 const CBioseq_set* bss = static_cast<const CBioseq_set*>(&*m_CurrentNode->m_Obj);
1045 if (bss->CanGetDescr() && bss->GetDescr().CanGet()) {
1046 for (auto& desc : bss->GetDescr().Get()) {
1047 m_CurrentNode->AddDescriptor(*desc);
1048 }
1049 }
1050 if (bss->IsSetAnnot()) {
1051 for (auto& annot : bss->GetAnnot()) {
1052 if (annot->IsFtable()) {
1053 for (auto& feat : annot->GetData().GetFtable()) {
1054 m_CurrentNode->AddFeature(*feat);
1055 }
1056 }
1057 }
1058 }
1059
1060 for (auto& se : bss->GetSeq_set()) {
1061 if (se->IsSet()) {
1062 PushNode(CDiscrepancyContext::eSeqSet);
1063 m_CurrentNode->m_Obj.Reset(&se->GetSet());
1064 AutofixBioseq_set();
1065 }
1066 else {
1067 PushNode(CDiscrepancyContext::eBioseq);
1068 m_CurrentNode->m_Obj.Reset(&se->GetSeq());
1069 AutofixBioseq();
1070 }
1071 PopNode();
1072 }
1073
1074 for (auto* fix : *m_Fixes) {
1075 if (CanFixBioseq_set(*fix->m_Fix)) {
1076 m_NodeMap[&*fix->m_Fix] = m_CurrentNode;
1077 CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*fix->m_Case).Autofix(fix, *this);
1078 }
1079 }
1080 }
1081
1082
AutofixBioseq()1083 void CDiscrepancyContext::AutofixBioseq()
1084 {
1085 const CBioseq* bs = static_cast<const CBioseq*>(&*m_CurrentNode->m_Obj);
1086 if (bs->CanGetDescr() && bs->GetDescr().CanGet()) {
1087 for (auto& desc : bs->GetDescr().Get()) {
1088 m_CurrentNode->AddDescriptor(*desc);
1089 }
1090 }
1091 if (bs->IsSetAnnot()) {
1092 for (auto& annot : bs->GetAnnot()) {
1093 if (annot->IsFtable()) {
1094 for (auto& feat : annot->GetData().GetFtable()) {
1095 m_CurrentNode->AddFeature(*feat);
1096 }
1097 }
1098 }
1099 }
1100
1101 for (auto* fix : *m_Fixes) {
1102 if (CanFixBioseq(*fix->m_Fix)) {
1103 m_NodeMap[&*fix->m_Fix] = m_CurrentNode;
1104 CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*fix->m_Case).Autofix(fix, *this);
1105 }
1106 }
1107 }
1108
1109
1110 END_SCOPE(NDiscrepancy)
1111 END_NCBI_SCOPE
1112