1 /*  $Id: discrepancy_stream.cpp 627092 2021-03-09 14:28:00Z ivanov $
2  * =========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: Sema Kachalo
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include "discrepancy_core.hpp"
32 #include "utils.hpp"
33 #include <sstream>
34 #include <objmgr/object_manager.hpp>
35 #include <objmgr/seqdesc_ci.hpp>
36 #include <objmgr/util/sequence.hpp>
37 #include <serial/objcopy.hpp>
38 #include <util/compress/stream_util.hpp>
39 #include <util/line_reader.hpp>
40 #include <util/format_guess.hpp>
41 
42 
43 BEGIN_NCBI_SCOPE
44 BEGIN_SCOPE(NDiscrepancy)
45 USING_SCOPE(objects);
46 
47 
// Nesting depth of the Bioseq-set currently being read; it is incremented
// around DefaultRead() in CReadHook_Bioseq_set and only consumed by Offset().
static size_t offset = 0;

/// Debug helper: returns an indentation prefix of two spaces per nesting
/// level (used by the commented-out trace statements in this file).
string Offset() // LCOV_EXCL_START
{
    const size_t width = offset * 2;
    return string(width, ' ');
} // LCOV_EXCL_STOP
53 
54 class CReadHook_Bioseq_set : public CReadObjectHook
55 {
56 public:
CReadHook_Bioseq_set(CDiscrepancyContext * context)57     CReadHook_Bioseq_set(CDiscrepancyContext* context) : m_Context(context) {}
ReadObject(CObjectIStream & stream,const CObjectInfo & passed_info)58     void ReadObject(CObjectIStream& stream, const CObjectInfo& passed_info) override
59     {
60         if (m_Context->Skip()) {
61             m_Context->PushNode(CDiscrepancyContext::eSeqSet);
62             //cout << Offset() << "Skipping Bioseq_set " << m_Context->m_CurrentNode->m_Index << "\n";
63             m_Context->m_CurrentNode->m_Pos = stream.GetStreamPos();
64             DefaultSkip(stream, passed_info);
65             m_Context->PopNode();
66         }
67         else {
68             bool repeat = m_Context->m_CurrentNode->m_Repeat;
69             m_Context->m_CurrentNode->m_Repeat = false;
70             if (!repeat) {
71                 m_Context->PushNode(CDiscrepancyContext::eSeqSet);
72             }
73             //cout << Offset() << "Reading " << m_Context->m_CurrentNode->Path() << "\n";
74             offset++;
75             DefaultRead(stream, passed_info);
76             offset--;
77             //cout << Offset() << "Done    " << m_Context->m_CurrentNode->Path() << "\n";
78             m_Context->m_CurrentNode->m_Obj.Reset((CObject*)passed_info.GetObjectPtr());
79             if (!repeat) {
80                 m_Context->PopNode();
81             }
82         }
83     }
84 protected:
85     CDiscrepancyContext* m_Context;
86 };
87 
88 
89 class CReadHook_Bioseq : public CReadObjectHook
90 {
91 public:
CReadHook_Bioseq(CDiscrepancyContext * context)92     CReadHook_Bioseq(CDiscrepancyContext* context) : m_Context(context) {}
ReadObject(CObjectIStream & stream,const CObjectInfo & passed_info)93     void ReadObject(CObjectIStream& stream, const CObjectInfo& passed_info) override
94     {
95         if (m_Context->Skip()) {
96             m_Context->PushNode(CDiscrepancyContext::eBioseq);
97             //cout << Offset() << "Skipping Bioseq " << m_Context->m_CurrentNode->m_Index << "\n";
98             m_Context->m_CurrentNode->m_Pos = stream.GetStreamPos();
99             DefaultSkip(stream, passed_info);
100             m_Context->PopNode();
101         }
102         else {
103             bool repeat = m_Context->m_CurrentNode->m_Repeat;
104             m_Context->m_CurrentNode->m_Repeat = false;
105             if (!repeat) {
106                 m_Context->PushNode(CDiscrepancyContext::eBioseq);
107             }
108             //cout << Offset() << "Reading " << m_Context->m_CurrentNode->Path() << "\n";
109             DefaultRead(stream, passed_info);
110             m_Context->m_CurrentNode->m_Obj.Reset((CObject*)passed_info.GetObjectPtr());
111             if (!repeat) {
112                 m_Context->PopNode();
113             }
114         }
115     }
116 protected:
117     CDiscrepancyContext* m_Context;
118 };
119 
120 
121 class CReadHook_Bioseq_set_class : public CReadClassMemberHook
122 {
123 public:
CReadHook_Bioseq_set_class(CDiscrepancyContext * context)124     CReadHook_Bioseq_set_class(CDiscrepancyContext* context) : m_Context(context) {}
ReadClassMember(CObjectIStream & stream,const CObjectInfoMI & passed_info)125     void ReadClassMember(CObjectIStream& stream, const CObjectInfoMI& passed_info) override
126     {
127         DefaultRead(stream, passed_info);
128         const CBioseq_set::TClass& cl = *(const CBioseq_set::TClass*)passed_info.GetMember().GetObjectPtr();
129         switch (cl) {
130             case CBioseq_set::eClass_nuc_prot:
131                 m_Context->m_CurrentNode->SetType(CDiscrepancyContext::eSeqSet_NucProt);
132                 break;
133             case CBioseq_set::eClass_gen_prod_set:
134                 m_Context->m_CurrentNode->SetType(CDiscrepancyContext::eSeqSet_GenProd);
135                 break;
136             case CBioseq_set::eClass_segset:
137                 m_Context->m_CurrentNode->SetType(CDiscrepancyContext::eSeqSet_SegSet);
138                 break;
139             case CBioseq_set::eClass_small_genome_set:
140                 m_Context->m_CurrentNode->SetType(CDiscrepancyContext::eSeqSet_Genome);
141                 break;
142             case CBioseq_set::eClass_eco_set:
143             case CBioseq_set::eClass_mut_set:
144             case CBioseq_set::eClass_phy_set:
145             case CBioseq_set::eClass_pop_set:
146                 m_Context->m_CurrentNode->SetType(CDiscrepancyContext::eSeqSet_Funny);
147                 break;
148             default:
149                 break;
150         }
151     }
152 
153 protected:
154     CDiscrepancyContext* m_Context;
155 };
156 
157 
PushNode(EObjType type)158 void CDiscrepancyContext::PushNode(EObjType type)
159 {
160     CRef<CParseNode> new_node(new CParseNode(type, (unsigned)m_CurrentNode->m_Children.size(), m_CurrentNode));
161     m_CurrentNode->m_Children.push_back(new_node);
162     m_CurrentNode.Reset(new_node);
163 }
164 
165 
Skip()166 bool CDiscrepancyContext::Skip()
167 {
168     // Not skipping the first child or nuc-prot children
169     return m_Skip && m_CurrentNode->m_Type == eSeqSet && !m_CurrentNode->m_Repeat && m_CurrentNode->m_Children.size();
170 }
171 
172 
ParseStrings(const string & fname)173 void CDiscrepancyContext::ParseStrings(const string& fname)
174 {
175     m_RootNode.Reset(new CParseNode(eFile, 0));
176     m_RootNode->m_Ref->m_Text = fname;
177     m_CurrentNode.Reset(m_RootNode);
178 
179     CNcbiIfstream istr(fname);
180     CStreamLineReader line_reader(istr);
181     do {
182         PushNode(eString);
183         m_CurrentNode->m_Ref->m_Text = *++line_reader;
184         RunTests();
185         PopNode();
186     }
187     while (!line_reader.AtEOF());
188 }
189 
190 
ParseStream(CObjectIStream & stream,const string & fname,bool skip,const string & default_header)191 void CDiscrepancyContext::ParseStream(CObjectIStream& stream, const string& fname, bool skip, const string& default_header)
192 {
193     m_Skip = skip;
194     CObjectTypeInfo(CType<CBioseq_set>()).SetLocalReadHook(stream, new CReadHook_Bioseq_set(this));
195     CObjectTypeInfo(CType<CBioseq_set>()).FindMember("class").SetLocalReadHook(stream, new CReadHook_Bioseq_set_class(this));
196     CObjectTypeInfo(CType<CBioseq>()).SetLocalReadHook(stream, new CReadHook_Bioseq(this));
197 
198     m_RootNode.Reset(new CParseNode(eFile, 0));
199     m_RootNode->m_Ref->m_Text = fname;
200     m_CurrentNode.Reset(m_RootNode);
201 
202     while (true) {
203         string header = stream.ReadFileHeader();
204         if (header.empty()) {
205             header = default_header;
206         }
207         //cout << "Reading " << header << "\n";
208         PushNode(eNone);
209 
210         if (header == CSeq_submit::GetTypeInfo()->GetName()) {
211             PushNode(eSubmit);
212             CRef<CSeq_submit> ss(new CSeq_submit);
213             stream.Read(ObjectInfo(*ss), CObjectIStream::eNoFileHeader);
214             m_CurrentNode->m_Obj.Reset(ss);
215             PopNode();
216         }
217         else if (header == CSeq_entry::GetTypeInfo()->GetName()) {
218             CRef<CSeq_entry> se(new CSeq_entry);
219             stream.Read(ObjectInfo(*se), CObjectIStream::eNoFileHeader);
220         }
221         else if (header == CBioseq_set::GetTypeInfo()->GetName()) {
222             CRef<CBioseq_set> set(new CBioseq_set);
223             stream.Read(ObjectInfo(*set), CObjectIStream::eNoFileHeader);
224         }
225         else if (header == CBioseq::GetTypeInfo()->GetName()) {
226             CRef<CBioseq> seq(new CBioseq);
227             stream.Read(ObjectInfo(*seq), CObjectIStream::eNoFileHeader);
228         }
229         else {
230             NCBI_THROW(CException, eUnknown, "Unsupported type " + header); // LCOV_EXCL_LINE
231         }
232         CNcbiStreampos position = stream.GetStreamPos();
233         Extend(*m_CurrentNode, stream);
234         if (m_Skip) {
235             stream.SetStreamPos(position);
236         }
237         PopNode();
238         if (stream.EndOfData()) {
239             break;
240         }
241     }
242 }
243 
244 
Extend(CParseNode & node,CObjectIStream & stream)245 void CDiscrepancyContext::Extend(CParseNode& node, CObjectIStream& stream)
246 {
247 //cout << "Reading " << node.Path() << "\n";
248     bool load = (node.m_Type == eSeqSet_NucProt && !InGenProdSet(&node)) || (node.m_Type == eSeqSet_GenProd && !InNucProtSet(&node)) || (node.m_Type == eBioseq && !InNucProtSet(&node) && !InGenProdSet(&node));
249     if (load) {
250         CRef<CSeq_entry> se(new CSeq_entry());
251         if (node.m_Type == eBioseq) {
252             se->SetSeq((CBioseq&)*node.m_Obj);
253         }
254         else { // node.m_Type == eSeqSet_NucProt
255             se->SetSet((CBioseq_set&)*node.m_Obj);
256         }
257         auto handle = m_Scope->AddTopLevelSeqEntry(*se);
258         m_FeatTree.Reset(new feature::CFeatTree(handle));
259     }
260     Populate(node);
261 
262     for (size_t i = 0; i < node.m_Children.size(); i++) {
263         CParseNode& item = *node.m_Children[i];
264         if (!item.m_Obj) {
265             stream.SetStreamPos(item.m_Pos);
266             item.m_Repeat = true;
267             m_CurrentNode.Reset(&item);
268             if (item.m_Type == eBioseq) {
269                 CRef<CBioseq> seq(new CBioseq);
270                 stream.Read(ObjectInfo(*seq), CObjectIStream::eNoFileHeader);
271             }
272             else if (item.m_Type == eSeqSet) {
273                 CRef<CBioseq_set> set(new CBioseq_set);
274                 stream.Read(ObjectInfo(*set), CObjectIStream::eNoFileHeader);
275             }
276         }
277         Extend(item, stream);
278         if (node.m_Type != eSeqSet_NucProt && node.m_Type != eSeqSet_GenProd) {
279             node.m_Children[i].Reset();
280         }
281     }
282 
283 //cout << "Running tests on " << node.Path() << " ...\n";
284     m_CurrentNode.Reset(&node);
285     RunTests();
286 
287     if (load) {
288         //m_FeatTree.Reset();
289         m_Scope->ResetDataAndHistory();
290     }
291 }
292 
293 
ParseObject(const CBioseq & root)294 void CDiscrepancyContext::ParseObject(const CBioseq& root)
295 {
296     CRef<CParseNode> current = m_CurrentNode;
297     PushNode(eBioseq);
298     m_CurrentNode->m_Obj.Reset(&root);
299     m_CurrentNode = current;
300 }
301 
302 
ParseObject(const CBioseq_set & root)303 void CDiscrepancyContext::ParseObject(const CBioseq_set& root)
304 {
305     CRef<CParseNode> current = m_CurrentNode;
306     EObjType type = eSeqSet;
307     if (root.IsSetClass()) {
308         switch (root.GetClass()) {
309             case CBioseq_set::eClass_nuc_prot:
310                 type = eSeqSet_NucProt;
311                 break;
312             case CBioseq_set::eClass_gen_prod_set:
313                 type = eSeqSet_GenProd;
314                 break;
315             case CBioseq_set::eClass_segset:
316                 type = eSeqSet_SegSet;
317                 break;
318             case CBioseq_set::eClass_small_genome_set:
319                 type = eSeqSet_Genome;
320                 break;
321             case CBioseq_set::eClass_eco_set:
322             case CBioseq_set::eClass_mut_set:
323             case CBioseq_set::eClass_phy_set:
324             case CBioseq_set::eClass_pop_set:
325                 type = eSeqSet_Funny;
326                 break;
327             default:
328                 break;
329         }
330     }
331     PushNode(type);
332     m_CurrentNode->m_Obj.Reset(&root);
333     if (root.CanGetSeq_set()) {
334         for (const auto& entry : root.GetSeq_set()) {
335             ParseObject(*entry);
336         }
337     }
338     m_CurrentNode = current;
339 }
340 
341 
ParseObject(const CSeq_entry & root)342 void CDiscrepancyContext::ParseObject(const CSeq_entry& root)
343 {
344     if (root.IsSet()) {
345         ParseObject(root.GetSet());
346     }
347     else if (root.IsSeq()) {
348         ParseObject(root.GetSeq());
349     }
350 }
351 
352 
ParseObject(const CSeq_submit & root)353 void CDiscrepancyContext::ParseObject(const CSeq_submit& root)
354 {
355     CRef<CParseNode> current = m_CurrentNode;
356     PushNode(eSubmit);
357     m_CurrentNode->m_Obj.Reset(&root);
358     if (root.CanGetData() && root.GetData().IsEntrys()) {
359         for (const auto& entry : root.GetData().GetEntrys()) {
360             ParseObject(*entry);
361         }
362     }
363     m_CurrentNode = current;
364 }
365 
366 
ParseAll(CParseNode & node)367 void CDiscrepancyContext::ParseAll(CParseNode& node)
368 {
369     Populate(node);
370     for (auto& item : node.m_Children) {
371         ParseAll(*item);
372     }
373     m_CurrentNode.Reset(&node);
374     RunTests();
375 }
376 
377 
Populate(CParseNode & node)378 void CDiscrepancyContext::Populate(CParseNode& node)
379 {
380     switch (node.m_Type) {
381         case eSeqSet:
382         case eSeqSet_NucProt:
383         case eSeqSet_GenProd:
384         case eSeqSet_SegSet:
385         case eSeqSet_Genome:
386         case eSeqSet_Funny:
387             PopulateSeqSet(node);
388             break;
389         case eBioseq:
390             PopulateBioseq(node);
391             break;
392         case eSubmit:
393             PopulateSubmit(node);
394             break;
395         default:
396             break;
397     }
398 }
399 
400 
PopulateSubmit(CParseNode & node)401 void CDiscrepancyContext::PopulateSubmit(CParseNode& node)
402 {
403     const CSeq_submit& sub = dynamic_cast<const CSeq_submit&>(*node.m_Obj);
404     if (sub.IsSetSub()) {
405         if (sub.GetSub().IsSetCit() && sub.GetSub().GetCit().CanGetAuthors()) {
406             const CAuth_list* auth = &sub.GetSub().GetCit().GetAuthors();
407             node.m_Authors.push_back(auth);
408             node.m_AuthorMap[auth] = &node;
409         }
410     }
411 }
412 
413 
PopulateBioseq(CParseNode & node)414 void CDiscrepancyContext::PopulateBioseq(CParseNode& node)
415 {
416     const CBioseq& bioseq = dynamic_cast<const CBioseq&>(*node.m_Obj);
417     if (bioseq.CanGetDescr() && bioseq.GetDescr().CanGet()) {
418         for (const auto& desc : bioseq.GetDescr().Get()) {
419             node.AddDescriptor(*desc);
420             if (desc->IsMolinfo()) {
421                 node.m_Molinfo.Reset(desc);
422             }
423             else if (desc->IsSource()) {
424                 node.m_Biosource.Reset(desc);
425             }
426             else if (desc->IsTitle()) {
427                 node.m_Title.Reset(desc);
428             }
429         }
430     }
431     if (bioseq.IsSetAnnot()) {
432         for (const auto& annot : bioseq.GetAnnot()) {
433             if (annot->IsFtable()) {
434                 for (const auto& feat : annot->GetData().GetFtable()) {
435                     node.AddFeature(*feat);
436                 }
437             }
438         }
439     }
440     node.m_BioseqSummary.reset(new CSeqSummary());
441     BuildSeqSummary(bioseq, *node.m_BioseqSummary);
442     string label = node.m_BioseqSummary->Label;
443     node.m_Ref->m_Text = node.m_BioseqSummary->Label + "\n" + node.m_BioseqSummary->GetStats();
444     for (CParseNode* n = node.m_Parent; n; n = n->m_Parent) {
445         if ((!IsSeqSet(n->m_Type) && n->m_Type != eSubmit) || !n->m_Ref->m_Text.empty()) {
446             break;
447         }
448         n->m_Ref->m_Text = n->m_Type == eSeqSet_NucProt || n->m_Type == eSeqSet_SegSet ? node.m_BioseqSummary->Label : label;
449         label = n->m_Ref->GetText();
450     }
451     if (node.m_Biosource) {
452         for (CParseNode* n = node.m_Parent; n && IsSeqSet(n->m_Type); n = n->m_Parent) {
453             if (n->m_Type == eSeqSet_Genome || n->m_Type == eSeqSet_Funny) {
454                 n->m_SetBiosources.push_back(node.m_Biosource);
455             }
456         }
457     }
458 }
459 
460 
PopulateSeqSet(CParseNode & node)461 void CDiscrepancyContext::PopulateSeqSet(CParseNode& node)
462 {
463     const CBioseq_set& seqset = dynamic_cast<const CBioseq_set&>(*node.m_Obj);
464     if (seqset.CanGetDescr() && seqset.GetDescr().CanGet()) {
465         for (const auto& desc : seqset.GetDescr().Get()) {
466             node.AddDescriptor(*desc);
467             if (desc->IsMolinfo()) {
468                 node.m_Molinfo.Reset(desc);
469             }
470             else if (desc->IsSource()) {
471                 node.m_Biosource.Reset(desc);
472             }
473             else if (desc->IsTitle()) {
474                 node.m_Title.Reset(desc);
475             }
476         }
477     }
478     if (seqset.IsSetAnnot()) {
479         for (const auto& annot : seqset.GetAnnot()) {
480             if (annot->IsFtable()) {
481                 for (const auto& feat : annot->GetData().GetFtable()) {
482                     node.AddFeature(*feat);
483                 }
484             }
485         }
486     }
487     if (node.m_Biosource) {
488         for (CParseNode* n = &node; n && IsSeqSet(n->m_Type); n = n->m_Parent) {
489             if (n->m_Type == eSeqSet_Genome) {
490                 n->m_SetBiosources.push_back(node.m_Biosource);
491             }
492         }
493     }
494 }
495 
496 
FindNode(const CRefNode & ref)497 CDiscrepancyContext::CParseNode* CDiscrepancyContext::FindNode(const CRefNode& ref)
498 {
499     auto it = m_NodeMap.find(&ref);
500     if (it != m_NodeMap.end()) {
501         return it->second;
502     }
503     if (ref.m_Parent) {
504         CParseNode* p = FindNode(*ref.m_Parent);
505         if (p) {
506             switch (ref.m_Type) {
507                 case eSeqFeat:
508                     m_NodeMap[&ref] = p->m_Features[ref.m_Index];
509                     break;
510                 case eSeqDesc:
511                     m_NodeMap[&ref] = p->m_Descriptors[ref.m_Index];
512                     break;
513                 default:
514                     m_NodeMap[&ref] = p->m_Children[ref.m_Index];
515                     break;
516             }
517             return m_NodeMap[&ref];
518         }
519     }
520     return nullptr;
521 }
522 
523 
FindObject(CReportObj & obj,bool alt)524 const CSerialObject* CDiscrepancyContext::FindObject(CReportObj& obj, bool alt)
525 {
526     CDiscrepancyObject& p = static_cast<CDiscrepancyObject&>(obj);
527     CParseNode* node = FindNode(alt ? *p.m_Fix : *p.m_Ref);
528     return node ? dynamic_cast<const CSerialObject*>(&*node->m_Obj) : nullptr;
529 }
530 
531 
ReplaceObject(CReportObj & obj,CSerialObject * ser,bool alt)532 void CDiscrepancyContext::ReplaceObject(CReportObj& obj, CSerialObject* ser, bool alt)
533 {
534     CDiscrepancyObject* p = static_cast<CDiscrepancyObject*>(&obj);
535     CParseNode* node = FindNode(alt ? *p->m_Fix : *p->m_Ref);
536     node->m_Obj.Reset(ser);
537 }
538 
539 
ReplaceSeq_feat(CReportObj & obj,const CSeq_feat & old_feat,CSeq_feat & new_feat,bool alt)540 void CDiscrepancyContext::ReplaceSeq_feat(CReportObj& obj, const CSeq_feat& old_feat, CSeq_feat& new_feat, bool alt)
541 {
542     if (m_AF_Seq_annot) {
543         auto& ftable = m_AF_Seq_annot->SetData().SetFtable();
544         for (auto& feat : ftable) {
545             if (&*feat == &old_feat) {
546                 feat.Reset(&new_feat);
547             }
548         }
549     }
550     else {
551         CSeq_feat_EditHandle feh(GetScope().GetSeq_featHandle(old_feat));
552         feh.Replace(new_feat);
553     }
554     ReplaceObject(obj, &new_feat, alt);
555 }
556 
557 
558 // AUTOFIX ////////////////////////////////////////////////////////////////////////
559 
560 class CCopyHook_Bioseq_set : public CCopyObjectHook
561 {
562 public:
CCopyHook_Bioseq_set(CDiscrepancyContext * context)563     CCopyHook_Bioseq_set(CDiscrepancyContext* context) : m_Context(context) {}
CopyObject(CObjectStreamCopier & copier,const CObjectTypeInfo & passed_info)564     void CopyObject(CObjectStreamCopier& copier, const CObjectTypeInfo& passed_info) override
565     {
566         m_Context->PushNode(CDiscrepancyContext::eSeqSet);
567         if (m_Context->CanFixBioseq_set()) {
568             m_Context->m_AF_Bioseq_set.Reset(new CBioseq_set);
569             copier.In().ReadObject(m_Context->m_AF_Bioseq_set, passed_info.GetTypeInfo());
570             m_Context->m_CurrentNode->m_Obj.Reset(m_Context->m_AF_Bioseq_set);
571             CRef<CSeq_entry> se(new CSeq_entry());
572             se->SetSet(*m_Context->m_AF_Bioseq_set);
573             auto handle = m_Context->m_Scope->AddTopLevelSeqEntry(*se);
574             m_Context->m_FeatTree.Reset(new feature::CFeatTree(handle));
575             m_Context->AutofixBioseq_set();
576             copier.Out().WriteObject(m_Context->m_AF_Bioseq_set, passed_info.GetTypeInfo());
577             m_Context->m_AF_Bioseq_set.Reset();
578         }
579         else {
580             DefaultCopy(copier, passed_info);
581         }
582         m_Context->PopNode();
583     }
584 protected:
585     CDiscrepancyContext* m_Context;
586 };
587 
588 
589 class CCopyHook_Bioseq : public CCopyObjectHook
590 {
591 public:
CCopyHook_Bioseq(CDiscrepancyContext * context)592     CCopyHook_Bioseq(CDiscrepancyContext* context) : m_Context(context) {}
CopyObject(CObjectStreamCopier & copier,const CObjectTypeInfo & passed_info)593     void CopyObject(CObjectStreamCopier& copier, const CObjectTypeInfo& passed_info) override
594     {
595         m_Context->PushNode(CDiscrepancyContext::eBioseq);
596         if (m_Context->CanFixBioseq()) {
597             m_Context->m_AF_Bioseq.Reset(new CBioseq);
598             copier.In().ReadObject(m_Context->m_AF_Bioseq, passed_info.GetTypeInfo());
599             m_Context->m_CurrentNode->m_Obj.Reset(m_Context->m_AF_Bioseq);
600             CRef<CSeq_entry> se(new CSeq_entry());
601             se->SetSeq(*m_Context->m_AF_Bioseq);
602             auto handle = m_Context->m_Scope->AddTopLevelSeqEntry(*se);
603             m_Context->m_FeatTree.Reset(new feature::CFeatTree(handle));
604             m_Context->AutofixBioseq();
605             copier.Out().WriteObject(m_Context->m_AF_Bioseq, passed_info.GetTypeInfo());
606             m_Context->m_AF_Bioseq.Reset();
607         }
608         else {
609             DefaultCopy(copier, passed_info);
610         }
611         m_Context->PopNode();
612     }
613 protected:
614     CDiscrepancyContext* m_Context;
615 };
616 
617 
618 class CCopyHook_Seq_descr : public CCopyObjectHook
619 {
620 public:
CCopyHook_Seq_descr(CDiscrepancyContext * context)621     CCopyHook_Seq_descr(CDiscrepancyContext* context) : m_Context(context) {}
CopyObject(CObjectStreamCopier & copier,const CObjectTypeInfo & passed_info)622     void CopyObject(CObjectStreamCopier& copier, const CObjectTypeInfo& passed_info) override
623     {
624         if (m_Context->CanFixSeqdesc()) {
625             m_Context->m_AF_Seq_descr.Reset(new CSeq_descr);
626             copier.In().ReadObject(m_Context->m_AF_Seq_descr, passed_info.GetTypeInfo());
627             m_Context->AutofixSeq_descr();
628             copier.Out().WriteObject(m_Context->m_AF_Seq_descr, passed_info.GetTypeInfo());
629             m_Context->m_AF_Seq_descr.Reset();
630         }
631         else {
632             DefaultCopy(copier, passed_info);
633         }
634     }
635 protected:
636     CDiscrepancyContext* m_Context;
637 };
638 
639 
640 class CCopyHook_Seq_annot : public CCopyObjectHook
641 {
642 public:
CCopyHook_Seq_annot(CDiscrepancyContext * context)643     CCopyHook_Seq_annot(CDiscrepancyContext* context) : m_Context(context) {}
CopyObject(CObjectStreamCopier & copier,const CObjectTypeInfo & passed_info)644     void CopyObject(CObjectStreamCopier& copier, const CObjectTypeInfo& passed_info) override
645     {
646         if (m_Context->CanFixSeq_annot()) {
647             m_Context->m_AF_Seq_annot.Reset(new CSeq_annot);
648             copier.In().ReadObject(m_Context->m_AF_Seq_annot, passed_info.GetTypeInfo());
649             m_Context->AutofixSeq_annot();
650             copier.Out().WriteObject(m_Context->m_AF_Seq_annot, passed_info.GetTypeInfo());
651             m_Context->m_AF_Seq_annot.Reset();
652         }
653         else {
654             DefaultCopy(copier, passed_info);
655         }
656     }
657 protected:
658     CDiscrepancyContext* m_Context;
659 };
660 
661 
662 class CCopyHook_Submit_block : public CCopyObjectHook
663 {
664 public:
CCopyHook_Submit_block(CDiscrepancyContext * context)665     CCopyHook_Submit_block(CDiscrepancyContext* context) : m_Context(context) {}
CopyObject(CObjectStreamCopier & copier,const CObjectTypeInfo & passed_info)666     void CopyObject(CObjectStreamCopier& copier, const CObjectTypeInfo& passed_info) override
667     {
668         if (m_Context->CanFixSubmit_block()) {
669             m_Context->m_AF_Submit_block.Reset(new CSubmit_block);
670             copier.In().ReadObject(m_Context->m_AF_Submit_block, passed_info.GetTypeInfo());
671             m_Context->AutofixSubmit_block();
672             copier.Out().WriteObject(m_Context->m_AF_Submit_block, passed_info.GetTypeInfo());
673             m_Context->m_AF_Submit_block.Reset();
674         }
675         else {
676             DefaultCopy(copier, passed_info);
677         }
678     }
679 protected:
680     CDiscrepancyContext* m_Context;
681 };
682 
683 
OpenUncompressedStream(const string & fname,bool & compressed)684 unique_ptr<CObjectIStream> OpenUncompressedStream(const string& fname, bool& compressed) // One more copy!!!
685 {
686     unique_ptr<CNcbiIstream> InputStream(new CNcbiIfstream(fname, ios::binary));
687     CCompressStream::EMethod method;
688 
689     CFormatGuess::EFormat format = CFormatGuess::Format(*InputStream);
690     switch (format) {
691         case CFormatGuess::eGZip:  method = CCompressStream::eGZipFile;  break;
692         case CFormatGuess::eBZip2: method = CCompressStream::eBZip2;     break;
693         case CFormatGuess::eLzo:   method = CCompressStream::eLZO;       break;
694         default:                   method = CCompressStream::eNone;      break;
695     }
696     compressed = method != CCompressStream::eNone;
697     if (compressed) {
698         InputStream.reset(new CDecompressIStream(*InputStream.release(), method, CCompressStream::fDefault, eTakeOwnership));
699         format = CFormatGuess::Format(*InputStream);
700     }
701 
702     unique_ptr<CObjectIStream> objectStream;
703     switch (format)
704     {
705         case CFormatGuess::eBinaryASN:
706         case CFormatGuess::eTextASN:
707             objectStream.reset(CObjectIStream::Open(format == CFormatGuess::eBinaryASN ? eSerial_AsnBinary : eSerial_AsnText, *InputStream.release(), eTakeOwnership));
708             break;
709         default:
710             break;
711     }
712     objectStream->SetDelayBufferParsingPolicy(CObjectIStream::eDelayBufferPolicyAlwaysParse);
713     return objectStream;
714 }
715 
716 
Autofix(TReportObjectList & tofix,map<string,size_t> & rep,const string & default_header)717 void CDiscrepancyContext::Autofix(TReportObjectList& tofix, map<string, size_t>& rep, const string& default_header)
718 {
719     if (!tofix.empty()) {
720         sort(tofix.begin(), tofix.end(), CompareRefs);
721         bool in_file = false;
722         for (const CRefNode* node = static_cast<CDiscrepancyObject&>(*tofix[0]).m_Fix; node; node = node->m_Parent) {
723             if (node->m_Type == eFile) in_file = true;
724         }
725         if (!in_file) { // GBench etc. -- all objects already in the scope
726             for (auto& fix : tofix) {
727                 CDiscrepancyObject& obj = static_cast<CDiscrepancyObject&>(*fix);
728                 CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*obj.m_Case).Autofix(&obj, *this);
729                 if (result) {
730                     rep[result->GetS()] += result->GetN();
731                 }
732             }
733             return;
734         }
735 
736         vector<vector<CDiscrepancyObject*>> all_fixes;
737         string current_path;
738         for (auto& fix : tofix) {
739             string path;
740             CDiscrepancyObject& obj = static_cast<CDiscrepancyObject&>(*fix);
741             for (const CRefNode* node = obj.m_Fix; node; node = node->m_Parent) {
742                 if (node->m_Type == eFile) {
743                     path = node->m_Text;
744                     break;
745                 }
746             }
747             if (path != current_path) {
748                 current_path = path;
749                 vector<CDiscrepancyObject*> fixes;
750                 all_fixes.push_back(fixes);
751             }
752             all_fixes.back().push_back(&obj);
753         }
754         for (auto& fix : all_fixes) {
755             AutofixFile(fix, default_header);
756         }
757     }
758 }
759 
760 
AutofixFile(vector<CDiscrepancyObject * > & fixes,const string & default_header)761 void CDiscrepancyContext::AutofixFile(vector<CDiscrepancyObject*>&fixes, const string& default_header)
762 {
763     string path;
764     for (CRefNode* node = fixes[0]->m_Fix; node; node = node->m_Parent) {
765         if (node->m_Type == eFile) {
766             path = node->m_Text;
767             break;
768         }
769     }
770     bool compressed = false;
771     unique_ptr<CObjectIStream> in = OpenUncompressedStream(path, compressed);
772     cout << "Autofixing " << path << "\n";
773 
774     size_t dot = path.find_last_of('.');
775     if (dot != string::npos) {
776         size_t slash = path.find_last_of("/\\");
777         if (slash != string::npos && slash >= dot) {
778             dot = string::npos;
779         }
780     }
781     string fixed_path = !compressed && (dot != string::npos) ? path.substr(0, dot) + ".autofix" + path.substr(dot) : path + ".autofix.sqn";
782 
783     string header = in->ReadFileHeader();
784     in = OpenUncompressedStream(path, compressed);
785     unique_ptr<CObjectOStream> out(CObjectOStream::Open(eSerial_AsnText, fixed_path));
786     CObjectStreamCopier copier(*in, *out);
787 
788     m_Fixes = &fixes;
789     m_RootNode.Reset(new CParseNode(eFile, 0));
790     m_CurrentNode.Reset(m_RootNode);
791 
792     CObjectTypeInfo(CType<CBioseq_set>()).SetLocalCopyHook(copier, new CCopyHook_Bioseq_set(this));
793     CObjectTypeInfo(CType<CBioseq>()).SetLocalCopyHook(copier, new CCopyHook_Bioseq(this));
794     CObjectTypeInfo(CType<CSeq_descr>()).SetLocalCopyHook(copier, new CCopyHook_Seq_descr(this));
795     CObjectTypeInfo(CType<CSeq_annot>()).SetLocalCopyHook(copier, new CCopyHook_Seq_annot(this));
796     CObjectTypeInfo(CType<CSubmit_block>()).SetLocalCopyHook(copier, new CCopyHook_Submit_block(this));
797 
798     while (true) {
799         if (header.empty()) {
800             header = default_header;
801         }
802         //cout << "Reading " << header << "\n";
803 
804         PushNode(eNone);
805 
806         if (header == CSeq_submit::GetTypeInfo()->GetName()) {
807             PushNode(eSubmit);
808             copier.Copy(CSeq_submit::GetTypeInfo());
809             PopNode();
810         }
811         else if (header == CSeq_entry::GetTypeInfo()->GetName()) {
812             copier.Copy(CSeq_entry::GetTypeInfo());
813         }
814         else if (header == CBioseq_set::GetTypeInfo()->GetName()) {
815             copier.Copy(CBioseq_set::GetTypeInfo());
816         }
817         else if (header == CBioseq::GetTypeInfo()->GetName()) {
818             copier.Copy(CBioseq::GetTypeInfo());
819         }
820         else {
821             NCBI_THROW(CException, eUnknown, "Unsupported type " + header); // LCOV_EXCL_LINE
822         }
823         PopNode();
824         if (in->EndOfData()) {
825             break;
826         }
827         else {
828             // this will crash if the file is both compressed and concatenated,
829             // but we are not going to support those
830             CNcbiStreampos position = in->GetStreamPos();
831             header = in->ReadFileHeader();
832             in->SetStreamPos(position);
833         }
834     }
835 }
836 
837 
CanFixBioseq_set(CRefNode & refnode)838 bool CDiscrepancyContext::CanFixBioseq_set(CRefNode& refnode)
839 {
840     if (IsSeqSet(refnode.m_Type)) {
841         CRef<CRefNode> A(&refnode);
842         auto B = m_CurrentNode->m_Ref;
843         while (A && B) {
844             if (A->m_Index != B->m_Index) {
845                 return false;
846             }
847             A = A->m_Parent;
848             B = B->m_Parent;
849             if (!A && !B) {
850                 return true;
851             }
852         }
853     }
854     return false;
855 }
856 
857 
CanFixBioseq_set()858 bool CDiscrepancyContext::CanFixBioseq_set()
859 {
860     for (auto* fix : *m_Fixes) {
861         if (CanFixBioseq_set(*fix->m_Fix)) {
862             return true;
863         }
864     }
865     return false;
866 }
867 
868 
869 
CanFixBioseq(CRefNode & refnode)870 bool CDiscrepancyContext::CanFixBioseq(CRefNode& refnode)
871 {
872     if (refnode.m_Type == eBioseq) {
873         CRef<CRefNode> A(&refnode);
874         auto B = m_CurrentNode->m_Ref;
875         while (A && B) {
876             if (A->m_Index != B->m_Index) {
877                 return false;
878             }
879             A = A->m_Parent;
880             B = B->m_Parent;
881             if (!A && !B) {
882                 return true;
883             }
884         }
885     }
886     return false;
887 }
888 
889 
CanFixBioseq()890 bool CDiscrepancyContext::CanFixBioseq()
891 {
892     for (auto* fix : *m_Fixes) {
893         if (CanFixBioseq(*fix->m_Fix)) {
894             return true;
895         }
896     }
897     return false;
898 }
899 
900 
CanFixFeat(CRefNode & refnode)901 bool CDiscrepancyContext::CanFixFeat(CRefNode& refnode)
902 {
903     if (refnode.m_Type == eSeqFeat) {
904         auto A = refnode.m_Parent;
905         auto B = m_CurrentNode->m_Ref;
906         while (A && B) {
907             if (A->m_Index != B->m_Index) {
908                 return false;
909             }
910             A = A->m_Parent;
911             B = B->m_Parent;
912             if (!A && !B) {
913                 return true;
914             }
915         }
916     }
917     return false;
918 }
919 
920 
CanFixSeq_annot()921 bool CDiscrepancyContext::CanFixSeq_annot()
922 {
923     for (auto* fix : *m_Fixes) {
924         if (CanFixFeat(*fix->m_Fix)) {
925             return true;
926         }
927     }
928     return false;
929 }
930 
931 
CanFixDesc(CRefNode & refnode)932 bool CDiscrepancyContext::CanFixDesc(CRefNode& refnode)
933 {
934     if (refnode.m_Type == eSeqDesc) {
935         auto A = refnode.m_Parent;
936         auto B = m_CurrentNode->m_Ref;
937         while (A && B) {
938             if (A->m_Index != B->m_Index) {
939                 return false;
940             }
941             A = A->m_Parent;
942             B = B->m_Parent;
943             if (!A && !B) {
944                 return true;
945             }
946         }
947     }
948     return false;
949 }
950 
951 
CanFixSeqdesc()952 bool CDiscrepancyContext::CanFixSeqdesc()
953 {
954     for (auto* fix : *m_Fixes) {
955         if (CanFixDesc(*fix->m_Fix)) {
956             return true;
957         }
958     }
959     return false;
960 }
961 
962 
CanFixSubmit_block(CRefNode & refnode)963 bool CDiscrepancyContext::CanFixSubmit_block(CRefNode& refnode)
964 {
965     if (refnode.m_Type == eSubmit && m_CurrentNode->m_Ref->m_Type == eSubmit) {
966         CRef<CRefNode> A(&refnode);
967         auto B = m_CurrentNode->m_Ref;
968         while (A && B) {
969             if (A->m_Index != B->m_Index) {
970                 return false;
971             }
972             A = A->m_Parent;
973             B = B->m_Parent;
974             if (!A && !B) {
975                 return true;
976             }
977         }
978     }
979     return false;
980 }
981 
982 
CanFixSubmit_block()983 bool CDiscrepancyContext::CanFixSubmit_block()
984 {
985     for (auto* fix : *m_Fixes) {
986         if (CanFixSubmit_block(*fix->m_Fix)) {
987             return true;
988         }
989     }
990     return false;
991 }
992 
993 
AutofixSeq_annot()994 void CDiscrepancyContext::AutofixSeq_annot()
995 {
996     if (m_AF_Seq_annot->IsFtable()) {
997         for (auto& feat : m_AF_Seq_annot->GetData().GetFtable()) {
998             m_CurrentNode->AddFeature(*feat);
999         }
1000     }
1001 
1002     for (auto* fix : *m_Fixes) {
1003         if (CanFixFeat(*fix->m_Fix) && fix->m_Fix->m_Index < m_CurrentNode->m_Features.size()) {
1004             m_NodeMap[&*fix->m_Fix] = m_CurrentNode->m_Features[fix->m_Fix->m_Index];
1005             CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*fix->m_Case).Autofix(fix, *this);
1006         }
1007     }
1008 }
1009 
1010 
AutofixSeq_descr()1011 void CDiscrepancyContext::AutofixSeq_descr()
1012 {
1013     if (m_AF_Seq_descr->CanGet()) {
1014         for (auto& desc : m_AF_Seq_descr->Get()) {
1015             m_CurrentNode->AddDescriptor(*desc);
1016         }
1017     }
1018 
1019     for (auto* fix : *m_Fixes) {
1020         if (CanFixDesc(*fix->m_Fix) && fix->m_Fix->m_Index < m_CurrentNode->m_Descriptors.size()) {
1021             m_NodeMap[&*fix->m_Fix] = m_CurrentNode->m_Descriptors[fix->m_Fix->m_Index];
1022             CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*fix->m_Case).Autofix(fix, *this);
1023         }
1024     }
1025 }
1026 
1027 
AutofixSubmit_block()1028 void CDiscrepancyContext::AutofixSubmit_block()
1029 {
1030     CRef<CParseNode> sblock(new CParseNode(eSubmitBlock, 0));
1031     sblock->m_Obj.Reset(static_cast<CObject*>(&*m_AF_Submit_block));
1032 
1033     for (auto* fix : *m_Fixes) {
1034         if (CanFixSubmit_block(*fix->m_Fix)) {
1035             m_NodeMap[&*fix->m_Fix] = sblock;
1036             CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*fix->m_Case).Autofix(fix, *this);
1037         }
1038     }
1039 }
1040 
1041 
AutofixBioseq_set()1042 void CDiscrepancyContext::AutofixBioseq_set()
1043 {
1044     const CBioseq_set* bss = static_cast<const CBioseq_set*>(&*m_CurrentNode->m_Obj);
1045     if (bss->CanGetDescr() && bss->GetDescr().CanGet()) {
1046         for (auto& desc : bss->GetDescr().Get()) {
1047             m_CurrentNode->AddDescriptor(*desc);
1048         }
1049     }
1050     if (bss->IsSetAnnot()) {
1051         for (auto& annot : bss->GetAnnot()) {
1052             if (annot->IsFtable()) {
1053                 for (auto& feat : annot->GetData().GetFtable()) {
1054                     m_CurrentNode->AddFeature(*feat);
1055                 }
1056             }
1057         }
1058     }
1059 
1060     for (auto& se : bss->GetSeq_set()) {
1061         if (se->IsSet()) {
1062             PushNode(CDiscrepancyContext::eSeqSet);
1063             m_CurrentNode->m_Obj.Reset(&se->GetSet());
1064             AutofixBioseq_set();
1065         }
1066         else {
1067             PushNode(CDiscrepancyContext::eBioseq);
1068             m_CurrentNode->m_Obj.Reset(&se->GetSeq());
1069             AutofixBioseq();
1070         }
1071         PopNode();
1072     }
1073 
1074     for (auto* fix : *m_Fixes) {
1075         if (CanFixBioseq_set(*fix->m_Fix)) {
1076             m_NodeMap[&*fix->m_Fix] = m_CurrentNode;
1077             CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*fix->m_Case).Autofix(fix, *this);
1078         }
1079     }
1080 }
1081 
1082 
AutofixBioseq()1083 void CDiscrepancyContext::AutofixBioseq()
1084 {
1085     const CBioseq* bs = static_cast<const CBioseq*>(&*m_CurrentNode->m_Obj);
1086     if (bs->CanGetDescr() && bs->GetDescr().CanGet()) {
1087         for (auto& desc : bs->GetDescr().Get()) {
1088             m_CurrentNode->AddDescriptor(*desc);
1089         }
1090     }
1091     if (bs->IsSetAnnot()) {
1092         for (auto& annot : bs->GetAnnot()) {
1093             if (annot->IsFtable()) {
1094                 for (auto& feat : annot->GetData().GetFtable()) {
1095                     m_CurrentNode->AddFeature(*feat);
1096                 }
1097             }
1098         }
1099     }
1100 
1101     for (auto* fix : *m_Fixes) {
1102         if (CanFixBioseq(*fix->m_Fix)) {
1103             m_NodeMap[&*fix->m_Fix] = m_CurrentNode;
1104             CRef<CAutofixReport> result = static_cast<CDiscrepancyCore&>(*fix->m_Case).Autofix(fix, *this);
1105         }
1106     }
1107 }
1108 
1109 
1110 END_SCOPE(NDiscrepancy)
1111 END_NCBI_SCOPE
1112